2121 * DEALINGS IN THE SOFTWARE.
2222 */
2323
24+ /*
25+ * The comments following this one that use the same comment syntax as this
26+ * comment are quotes from the HTML Standard at https://html.spec.whatwg.org/
27+ * as of 10 September 2020. That document came with this statement:
28+ * Copyright © WHATWG (Apple, Google, Mozilla, Microsoft). This work is
29+ * licensed under a Creative Commons Attribution 4.0 International License.
30+ */
31+
2432package nu .validator .htmlparser .io ;
2533
2634import java .io .IOException ;
@@ -214,9 +222,8 @@ public void tokenize(InputSource is, int bufferSize)
214222 tokenizer .getErrorHandler (), tokenizer , this , heuristics );
215223 } else {
216224 if (this .characterEncoding != Encoding .UTF8 ) {
217- errorWithoutLocation ("Legacy encoding \u201C "
218- + this .characterEncoding .getCanonName ()
219- + "\u201D used. Documents must use UTF-8." );
225+ errorWithoutLocation (Encoding .msgLegacyEncoding (
226+ this .characterEncoding .getCanonName ()));
220227 }
221228 becomeConfident ();
222229 this .reader = new HtmlInputStreamReader (inputStream ,
@@ -350,50 +357,92 @@ public void setEncoding(Encoding encoding, Confidence confidence) {
350357 }
351358 }
352359
360+ private void errInternalActualDiffer (String internalCharset , String actual )
361+ throws SAXException {
362+ if (!internalCharset .equals (actual )) {
363+ tokenizer .errTreeBuilder (
364+ "Ignoring internal encoding declaration \u201C "
365+ + internalCharset + "\u201D , which disagrees with"
366+ + " the actual encoding of the document (\u201C "
367+ + actual + "\u201D )." );
368+ }
369+ }
370+
353371 public boolean internalEncodingDeclaration (String internalCharset )
354372 throws SAXException {
373+ String actual = characterEncoding .getCanonName ();
374+ if (confidence == Confidence .CERTAIN ) {
375+ errInternalActualDiffer (internalCharset , actual );
376+ return true ;
377+ }
378+ /* https://html.spec.whatwg.org/#changing-the-encoding-while-parsing */
355379 try {
380+ if ("utf-16be" .equals (actual ) || "utf-16le" .equals (actual )) {
381+ errInternalActualDiffer (internalCharset , actual );
382+ /*
383+ * 1. If the encoding that is already being used to interpret
384+ * the input stream is a UTF-16 encoding, then set the
385+ * confidence to certain and return. The new encoding is ignored
386+ * becomeConfident();
387+ */
388+ return true ;
389+ }
356390 internalCharset = internalCharset .toLowerCase ();
357391 Encoding cs = Encoding .forName (internalCharset );
358392 if ("utf-16be" .equals (internalCharset )
359393 || "utf-16le" .equals (internalCharset )) {
360- tokenizer .errTreeBuilder ("Internal encoding declaration specified \u201C "
361- + internalCharset
362- + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201C utf-8\u201D ." );
394+ /*
395+ * 2. If the new encoding is a UTF-16 encoding, then change it
396+ * to UTF-8.
397+ */
398+ tokenizer .errTreeBuilder (
399+ Encoding .msgIgnoredCharset (internalCharset , "utf-8" ));
363400 cs = Encoding .UTF8 ;
364401 internalCharset = "utf-8" ;
365- } else {
366- cs = Encoding .forName (internalCharset );
367- }
368- Encoding actual = cs .getActualHtmlEncoding ();
369- if (actual == null ) {
370- actual = cs ;
402+ } else if ("x-user-defined" .equals (internalCharset )) {
403+ /*
404+ * 3. If the new encoding is x-user-defined, then change it to
405+ * windows-1252.
406+ */
407+ tokenizer .errTreeBuilder (Encoding .msgIgnoredCharset (
408+ "x-user-defined" , "windows-1252" ));
409+ cs = Encoding .WINDOWS1252 ;
410+ internalCharset = "windows-1252" ;
371411 }
372412 if (characterEncoding == null ) {
373413 // Reader case
374414 return true ;
375415 }
376- if (characterEncoding == actual ) {
416+ if (characterEncoding == cs ) {
417+ /*
418+ * 4. If the new encoding is identical or equivalent to the
419+ * encoding that is already being used to interpret the input
420+ * stream, then set the confidence to certain and return.
421+ */
377422 becomeConfident ();
378423 return true ;
379424 }
380- if (confidence == Confidence .CERTAIN && actual != characterEncoding ) {
381- tokenizer .errTreeBuilder ("Internal encoding declaration \u201C "
382- + internalCharset
383- + "\u201D disagrees with the actual encoding of the document (\u201C "
384- + characterEncoding .getCanonName () + "\u201D )." );
385- } else {
386- Encoding newEnc = whineAboutEncodingAndReturnCanonical (
387- internalCharset , cs );
388- tokenizer .errTreeBuilder ("Changing character encoding \u201C "
389- + internalCharset + "\u201D and reparsing." );
390- characterEncoding = newEnc ;
391- throw new ReparseException ();
392- }
393- return true ;
425+ /*
426+ * 6. Otherwise, navigate to the document again, with
427+ * historyHandling set to "replace", and using the same source
428+ * browsing context, but this time skip the encoding sniffing
429+ * algorithm and instead just set the encoding to the new encoding
430+ */
431+ Encoding newEnc = whineAboutEncodingAndReturnCanonical (
432+ internalCharset , cs );
433+ tokenizer .errTreeBuilder ("Changing character encoding to \u201C "
434+ + internalCharset + "\u201D and reparsing." );
435+ characterEncoding = newEnc ;
436+ // Note: We intentionally don’t call becomeConfident() at this
437+ // point. If we did, it would end up causing the exception
438+ // java.lang.IllegalStateException: rewind() after willNotRewind()
439+ // to be thrown later. So we are departing here from strictly
440+ // following the ordering in the corresponding spec language, which
441+ // specifies setting the confidence to "certain" at this point.
442+ throw new ReparseException ();
394443 } catch (UnsupportedCharsetException e ) {
395- tokenizer .errTreeBuilder ("Internal encoding declaration named an unsupported chararacter encoding \u201C "
396- + internalCharset + " \u201D ." );
444+ tokenizer .errTreeBuilder (
445+ Encoding . msgBadInternalCharset ( internalCharset ) );
397446 return false ;
398447 }
399448 }
@@ -453,8 +502,7 @@ protected Encoding encodingFromExternalDeclaration(String encoding)
453502 }
454503 return whineAboutEncodingAndReturnCanonical (encoding , cs );
455504 } catch (UnsupportedCharsetException e ) {
456- tokenizer .err ("Unsupported character encoding name: \u201C " + encoding
457- + "\u201D . Will sniff." );
505+ tokenizer .err (Encoding .msgBadEncoding (encoding ) + " Will sniff." );
458506 swallowBom = true ;
459507 }
460508 return null ; // keep the compiler happy
@@ -470,7 +518,7 @@ protected Encoding whineAboutEncodingAndReturnCanonical(String encoding,
470518 Encoding cs ) throws SAXException {
471519 String canonName = cs .getCanonName ();
472520 if (!canonName .equals (encoding )) {
473- tokenizer .err (Encoding .msgNotPreferredName (encoding , canonName ));
521+ tokenizer .err (Encoding .msgNotCanonicalName (encoding , canonName ));
474522 }
475523 return cs ;
476524 }
0 commit comments