All files / src/compiler/phases/1-parse/utils html.js

86.3% Statements 145/168
86.66% Branches 26/30
100% Functions 5/5
86.14% Lines 143/166

Press n or j to go to the next uncovered block, b, p or k for the previous block.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 1672x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 62468x 62468x 62468x 62468x 1484x 1484x 60984x 60984x 2x 2x 28x 28x 28x 28x 28x 28x 28x 28x 28x 28x 2x 2x 2x 2x 2x 2x 2x 2x 2x 24061x 24061x 24061x 24061x 24061x 24061x 24061x 101x 101x 101x 101x 79x 101x 6x 22x 16x 16x 101x 101x     101x 101x 101x 24061x 24061x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 101x 101x 101x 2x 2x 99x 99x 101x 49x 49x 50x 50x 50x 101x 6x 6x 44x 44x 44x 44x 44x                         101x         101x           2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 2x 84x 84x 2x 2x 2x 2x 2x 2x 2x 2x 2x 1199x 228x 4x 4x 228x 1195x 1195x 1195x  
import { interactive_elements } from '../../../../constants.js';
import entities from './entities.js';
 
// Replacement code points for numeric character references in the range 128-159,
// indexed by `code - 128` (see `validate_code`). There are 32 entries, one per code;
// a few of them (129, 141, 143, 144, 157) map to themselves.
const windows_1252 = [
	8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216,
	8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376
];
 
/**
 * Returns the regular-expression source fragment used to match one named entity.
 *
 * @param {string} entity_name entity name as it appears after the `&`
 * @param {boolean} is_attribute_value whether the fragment will be used on an attribute value
 */
function reg_exp_entity(entity_name, is_attribute_value) {
	// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
	// Inside an attribute value, an entity written without its trailing `;` must not be
	// decoded when it is directly followed by `=` or by a letter or digit. `\b` rejects a
	// following word character and `(?!=)` rejects a following `=`.
	const is_ambiguous = is_attribute_value && !entity_name.endsWith(';');

	return is_ambiguous ? `${entity_name}\\b(?!=)` : entity_name;
}
 
/**
 * Compiles the global regular expression that finds character references (`&...`).
 * The single capture group holds everything after the `&`.
 *
 * @param {boolean} is_attribute_value whether the pattern is meant for attribute values
 */
function get_entity_pattern(is_attribute_value) {
	// numeric references: `#x` + hex digits or `#` + decimal digits, then an optional `;`
	const numeric_source = '#(?:x[a-fA-F\\d]+|\\d+)(?:;)?';

	/** @type {string[]} */
	const named_sources = [];
	for (const entity_name of Object.keys(entities)) {
		named_sources.push(reg_exp_entity(entity_name, is_attribute_value));
	}

	const alternatives = `${numeric_source}|${named_sources.join('|')}`;

	return new RegExp(`&(${alternatives})`, 'g');
}
 
// Both patterns are compiled once, when the module is loaded: one for attribute
// values and one for text content.
const entity_pattern_attr_value = get_entity_pattern(true);
const entity_pattern_content = get_entity_pattern(false);
 
/**
 * Replaces the named and numeric character references found in `html` with the
 * characters they stand for. References that resolve to a falsy code (for example
 * `&#0;`) are left in the output exactly as written.
 *
 * @param {string} html
 * @param {boolean} is_attribute_value
 */
export function decode_character_references(html, is_attribute_value) {
	const pattern = is_attribute_value ? entity_pattern_attr_value : entity_pattern_content;

	/**
	 * @param {any} match the full reference, including the leading `&`
	 * @param {keyof typeof entities} entity the reference without the leading `&`
	 */
	const decode = (match, entity) => {
		let code;

		if (entity[0] === '#') {
			// numeric reference: `#x...` is hexadecimal, `#...` is decimal
			const is_hex = entity[1] === 'x';
			code = parseInt(entity.substring(is_hex ? 2 : 1), is_hex ? 16 : 10);
		} else {
			// named reference
			code = entities[entity];
		}

		return code ? String.fromCodePoint(validate_code(code)) : match;
	};

	return html.replace(pattern, decode);
}
 
// Code point returned by `validate_code` in place of a code point it rejects
const NUL = 0;
 
// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
// to replace them ourselves
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
 
/**
 * Maps the code point named by a character reference to the code point that should
 * actually be emitted. Rejected code points are replaced with `NUL`.
 *
 * @param {number} code code point parsed from the character reference
 * @returns {number} the code point to emit
 */
function validate_code(code) {
	// line feed becomes generic whitespace
	if (code === 10) {
		return 32;
	}

	// ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
	if (code < 0x80) {
		return code;
	}

	// code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
	// to correct the mistake or we'll end up with missing € signs and so on
	if (code <= 0x9f) {
		return windows_1252[code - 0x80];
	}

	// basic multilingual plane, up to the surrogates
	if (code < 0xd800) {
		return code;
	}

	// UTF-16 surrogate halves 0xd800 - 0xdfff
	if (code <= 0xdfff) {
		return NUL;
	}

	// rest of the basic multilingual plane, then the supplementary multilingual plane
	// (0x10000 - 0x1ffff) and the supplementary ideographic plane (0x20000 - 0x2ffff)
	if (code <= 0x2ffff) {
		return code;
	}

	// supplementary special-purpose plane: tags (0xe0000 - 0xe007f) and the variation
	// selectors supplement (0xe0100 - 0xe01ef). These are valid code points and must not
	// be turned into NUL
	if ((code >= 0xe0000 && code <= 0xe007f) || (code >= 0xe0100 && code <= 0xe01ef)) {
		return code;
	}

	return NUL;
}
 
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
 
/**
 * For each element name, the set of tag names that implicitly close it when they follow
 * it. Each entry of the literal below is written as a space-separated list of tag names.
 *
 * @type {Record<string, Set<string>>}
 */
const disallowed_contents = Object.fromEntries(
	Object.entries({
		li: 'li',
		dt: 'dt dd',
		dd: 'dt dd',
		p: 'address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul',
		rt: 'rt rp',
		rp: 'rt rp',
		optgroup: 'optgroup',
		option: 'option optgroup',
		thead: 'tbody tfoot',
		tbody: 'tbody tfoot',
		tfoot: 'tbody',
		tr: 'tr tbody',
		td: 'td th tr',
		th: 'td th tr'
	}).map(([tag, closers]) => [tag, new Set(closers.split(' '))])
);

// every name in `interactive_elements` gets the whole collection as its entry
for (const interactive_element of interactive_elements) {
	disallowed_contents[interactive_element] = interactive_elements;
}
 
// can this be a child of the parent element, or does it implicitly
// close it, like `<li>one<li>two`?
 
/**
 * Tells whether the closing tag of `current` is implied, either because `next` is one
 * of the tags that implicitly close it or because no following tag is given.
 *
 * @param {string} current name of the element that is currently open
 * @param {string} [next] name of the tag that follows; omit when nothing follows
 * @returns {boolean}
 */
export function closing_tag_omitted(current, next) {
	// `disallowed_contents` is a plain object, so only its own keys count. Without this
	// check a tag named e.g. `constructor` or `toString` would resolve to an inherited
	// member and either throw on `.has` or be reported as implicitly closed
	if (!Object.prototype.hasOwnProperty.call(disallowed_contents, current)) {
		return false;
	}

	return !next || disallowed_contents[current].has(next);
}