diff --git a/HTMLparser.c b/HTMLparser.c index 4d43b93d..1a4d80d3 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -3448,7 +3448,8 @@ static void htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { const xmlChar *encoding; - if ((ctxt == NULL) || (attvalue == NULL)) + if ((ctxt == NULL) || (attvalue == NULL) || + (ctxt->options & HTML_PARSE_IGNORE_ENC)) return; /* do not change encoding */ @@ -3500,7 +3501,9 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { xmlSwitchToEncoding(ctxt, handler); ctxt->charset = XML_CHAR_ENCODING_UTF8; } else { - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "htmlCheckEncoding: unknown encoding %s\n", + encoding, NULL); } } @@ -6537,6 +6540,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) ctxt->options |= HTML_PARSE_NODEFDTD; options -= HTML_PARSE_NODEFDTD; } + if (options & HTML_PARSE_IGNORE_ENC) { + ctxt->options |= HTML_PARSE_IGNORE_ENC; + options -= HTML_PARSE_IGNORE_ENC; + } ctxt->dictNames = 0; return (options); } diff --git a/HTMLtree.c b/HTMLtree.c index b5085836..f23ae023 100644 --- a/HTMLtree.c +++ b/HTMLtree.c @@ -481,7 +481,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, if (enc != XML_CHAR_ENCODING_UTF8) { handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) - return(-1); + htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); } } @@ -562,11 +562,9 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { } handler = xmlFindCharEncodingHandler(encoding); - if (handler == NULL) { - *mem = NULL; - *size = 0; - return; - } + if (handler == NULL) + htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); + } else { handler = xmlFindCharEncodingHandler(encoding); } @@ -587,7 +585,7 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { return; } - htmlDocContentDumpFormatOutput(buf, cur, NULL, format); + htmlDocContentDumpFormatOutput(buf, cur, NULL, format); xmlOutputBufferFlush(buf); if (buf->conv != NULL) { @@ -1061,7 +1059,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) { handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) - return(-1); + htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); } else { handler = xmlFindCharEncodingHandler(encoding); } @@ -1120,7 +1118,7 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) { handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) - return(-1); + htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); } } @@ -1181,7 +1179,7 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur, handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) - return(-1); + htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); } htmlSetMetaEncoding(cur, (const xmlChar *) encoding); } else { diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index fbcc811e..10a3d654 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -184,7 +184,8 @@ typedef enum { HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ HTML_PARSE_NONET = 1<<11,/* Forbid network access */ HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ - HTML_PARSE_COMPACT = 1<<16 /* compact small text nodes */ + HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ + HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ } htmlParserOption; XMLPUBFUN void XMLCALL diff --git a/include/libxml/parser.h b/include/libxml/parser.h index 47b3df15..aabb96cf 100644 --- a/include/libxml/parser.h +++ b/include/libxml/parser.h @@ -1105,8 +1105,9 @@ typedef enum { crash if you try to modify the tree) */ XML_PARSE_OLD10 = 1<<17,/* parse using XML-1.0 before update 5 */ XML_PARSE_NOBASEFIX = 1<<18,/* do not fixup XINCLUDE xml:base uris */ - XML_PARSE_HUGE = 1<<19, /* relax any hardcoded limit from the parser */ - XML_PARSE_OLDSAX = 1<<20 /* parse using SAX2 interface from before 2.7.0 */ + XML_PARSE_HUGE = 1<<19,/* relax any hardcoded limit from the parser */ + XML_PARSE_OLDSAX = 1<<20,/* parse using SAX2 interface before 2.7.0 */ + XML_PARSE_IGNORE_ENC= 1<<21 /* ignore internal document encoding hint */ } xmlParserOption; XMLPUBFUN void XMLCALL diff --git a/parser.c b/parser.c index 9ab86412..02a18771 100644 --- a/parser.c +++ b/parser.c @@ -9922,6 +9922,13 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { } else { xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); } + + /* + * Non standard parsing, allowing the user to ignore encoding + */ + if (ctxt->options & XML_PARSE_IGNORE_ENC) + return(encoding); + /* * UTF-16 encoding stwich has already taken place at this stage, * more over the little-endian/big-endian selection is already done @@ -14561,6 +14568,10 @@ xmlCtxtUseOptionsInternal(xmlParserCtxtPtr ctxt, int options, const char *encodi ctxt->options |= XML_PARSE_OLDSAX; options -= XML_PARSE_OLDSAX; } + if (options & XML_PARSE_IGNORE_ENC) { + ctxt->options |= XML_PARSE_IGNORE_ENC; + options -= XML_PARSE_IGNORE_ENC; + } ctxt->linenumbers = 1; return (options); } diff --git a/xmllint.c b/xmllint.c index 8af56cd4..14c64bec 100644 --- a/xmllint.c +++ b/xmllint.c @@ -130,6 +130,7 @@ static int copy = 0; #endif /* LIBXML_TREE_ENABLED */ static int recovery = 0; static int noent = 0; +static int noenc = 0; static int noblanks = 0; static int noout = 0; static int nowrap = 0; @@ -2983,6 +2984,7 @@ static void usage(const char *name) { printf("\t--recover : output what was parsable on broken XML documents\n"); printf("\t--huge : remove any internal arbitrary parser limits\n"); printf("\t--noent : substitute entity references by their value\n"); + printf("\t--noenc : ignore any encoding specified inside the document\n"); printf("\t--noout : don't output the result tree\n"); printf("\t--path 'paths': provide a set of paths for resources\n"); printf("\t--load-trace : print trace of all external entites loaded\n"); @@ -3137,6 +3139,10 @@ main(int argc, char **argv) { (!strcmp(argv[i], "--noent"))) { noent++; options |= XML_PARSE_NOENT; + } else if ((!strcmp(argv[i], "-noenc")) || + (!strcmp(argv[i], "--noenc"))) { + noenc++; + options |= XML_PARSE_IGNORE_ENC; } else if ((!strcmp(argv[i], "-nsclean")) || (!strcmp(argv[i], "--nsclean"))) { options |= XML_PARSE_NSCLEAN;