Skip to content

Commit 92fff5c

Browse files
authored
Merge pull request #163 from tgalopin/charset-support
Allow to pass a charset to the Scanner
2 parents ca7c31b + 44ff776 commit 92fff5c

File tree

5 files changed

+85
-24
lines changed

5 files changed

+85
-24
lines changed

src/HTML5.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ public function parse($input, array $options = array())
155155
$this->errors = array();
156156
$options = array_merge($this->defaultOptions, $options);
157157
$events = new DOMTreeBuilder(false, $options);
158-
$scanner = new Scanner($input);
158+
$scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
159159
$parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML);
160160

161161
$parser->parse();
@@ -179,7 +179,7 @@ public function parseFragment($input, array $options = array())
179179
{
180180
$options = array_merge($this->defaultOptions, $options);
181181
$events = new DOMTreeBuilder(true, $options);
182-
$scanner = new Scanner($input);
182+
$scanner = new Scanner($input, !empty($options['encoding']) ? $options['encoding'] : 'UTF-8');
183183
$parser = new Tokenizer($scanner, $events, !empty($options['xmlNamespaces']) ? Tokenizer::CONFORMANT_XML : Tokenizer::CONFORMANT_HTML);
184184

185185
$parser->parse();

src/HTML5/Parser/UTF8Utils.php

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@
33
namespace Masterminds\HTML5\Parser;
44

55
/*
6-
*
7-
* Portions based on code from html5lib files with the following copyright:
6+
Portions based on code from html5lib files with the following copyright:
87
98
Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
109
@@ -26,28 +25,20 @@
2625
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
2726
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
2827
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29-
3028
*/
3129

3230
use Masterminds\HTML5\Exception;
3331

34-
/**
35-
* UTF-8 Utilities.
36-
*/
3732
class UTF8Utils
3833
{
3934
/**
40-
* The Unicode replacement character..
35+
* The Unicode replacement character.
4136
*/
4237
const FFFD = "\xEF\xBF\xBD";
4338

4439
/**
4540
* Count the number of characters in a string.
46-
*
47-
* UTF-8 aware. This will try (in order) iconv,
48-
* MB, libxml, and finally a custom counter.
49-
*
50-
* @todo Move this to a general utility class.
41+
* UTF-8 aware. This will try (in order) MB, iconv, utf8_decode, and finally a custom counter.
5142
*
5243
* @param string $string
5344
*
@@ -58,14 +49,20 @@ public static function countChars($string)
5849
// Get the length for the string we need.
5950
if (function_exists('mb_strlen')) {
6051
return mb_strlen($string, 'utf-8');
61-
} elseif (function_exists('iconv_strlen')) {
52+
}
53+
54+
if (function_exists('iconv_strlen')) {
6255
return iconv_strlen($string, 'utf-8');
63-
} elseif (function_exists('utf8_decode')) {
56+
}
57+
58+
if (function_exists('utf8_decode')) {
6459
// MPB: Will this work? Won't certain decodes lead to two chars
6560
// extrapolated out of 2-byte chars?
6661
return strlen(utf8_decode($string));
6762
}
63+
6864
$count = count_chars($string);
65+
6966
// 0x80 = 0x7F - 0 + 1 (one added to get inclusive range)
7067
// 0x33 = 0xF4 - 0xC2 + 1 (one added to get inclusive range)
7168
return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33));
@@ -85,7 +82,12 @@ public static function countChars($string)
8582
public static function convertToUTF8($data, $encoding = 'UTF-8')
8683
{
8784
/*
88-
* From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points.
85+
* From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted
86+
* to Unicode characters for the tokeniser, as described by the rules for that encoding,
87+
* except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped
88+
* by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes
89+
* in the original byte stream that could not be converted to Unicode characters must be
90+
* converted to U+FFFD REPLACEMENT CHARACTER code points.
8991
*/
9092

9193
// mb_convert_encoding is chosen over iconv because of a bug. The best
@@ -106,7 +108,8 @@ public static function convertToUTF8($data, $encoding = 'UTF-8')
106108
mb_substitute_character('none');
107109
$data = mb_convert_encoding($data, 'UTF-8', $encoding);
108110
mb_substitute_character($save);
109-
} // @todo Get iconv running in at least some environments if that is possible.
111+
}
112+
// @todo Get iconv running in at least some environments if that is possible.
110113
elseif (function_exists('iconv') && 'auto' !== $encoding) {
111114
// fprintf(STDOUT, "iconv found\n");
112115
// iconv has the following behaviors:
@@ -141,14 +144,20 @@ public static function checkForIllegalCodepoints($data)
141144
$errors = array();
142145

143146
/*
144-
* All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error.
147+
* All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs.
148+
* Any occurrences of such characters is a parse error.
145149
*/
146150
for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) {
147151
$errors[] = 'null-character';
148152
}
149153

150154
/*
151-
* Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.)
155+
* Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F
156+
* to U+009F, U+D800 to U+DFFF, U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF,
157+
* U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE,
158+
* U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF,
159+
* U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors.
160+
* (These are all control characters or permanently undefined Unicode characters.)
152161
*/
153162
// Check PCRE is loaded.
154163
$count = preg_match_all(
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<meta http-equiv="content-type" content="text/html;charset=utf-8" />
5+
</head>
6+
<body>
7+
<p>Žťčýů</p>
8+
</body>
9+
</html>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
<!doctype html>
2+
<html>
3+
<head>
4+
<meta http-equiv="content-type" content="text/html;charset=windows-1252">
5+
</head>
6+
<body>
7+
<p>Žèýù</p>
8+
</body>
9+
</html>

test/HTML5/Html5Test.php

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,15 @@
22

33
namespace Masterminds\HTML5\Tests;
44

5+
use Masterminds\HTML5;
6+
57
class Html5Test extends TestCase
68
{
9+
/**
10+
* @var HTML5
11+
*/
12+
private $html5;
13+
714
public function setUp()
815
{
916
$this->html5 = $this->getInstance();
@@ -50,8 +57,8 @@ public function testLoadOptions()
5057
{
5158
// doc
5259
$dom = $this->html5->loadHTML($this->wrap('<t:tag/>'), array(
53-
'implicitNamespaces' => array('t' => 'http://example.com'),
54-
'xmlNamespaces' => true,
60+
'implicitNamespaces' => array('t' => 'http://example.com'),
61+
'xmlNamespaces' => true,
5562
));
5663
$this->assertInstanceOf('\DOMDocument', $dom);
5764
$this->assertEmpty($this->html5->getErrors());
@@ -63,8 +70,8 @@ public function testLoadOptions()
6370

6471
// doc fragment
6572
$frag = $this->html5->loadHTMLFragment('<t:tag/>', array(
66-
'implicitNamespaces' => array('t' => 'http://example.com'),
67-
'xmlNamespaces' => true,
73+
'implicitNamespaces' => array('t' => 'http://example.com'),
74+
'xmlNamespaces' => true,
6875
));
6976
$this->assertInstanceOf('\DOMDocumentFragment', $frag);
7077
$this->assertEmpty($this->html5->getErrors());
@@ -76,6 +83,33 @@ public function testLoadOptions()
7683
$this->assertEquals(1, $xpath->query('//t:tag', $frag)->length);
7784
}
7885

86+
public function testEncodingUtf8()
87+
{
88+
$dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/utf-8.html');
89+
$this->assertInstanceOf('\DOMDocument', $dom);
90+
$this->assertEmpty($this->html5->getErrors());
91+
$this->assertFalse($this->html5->hasErrors());
92+
93+
$this->assertContains('Žťčýů', $dom->saveHTML());
94+
}
95+
96+
public function testEncodingWindows1252()
97+
{
98+
$dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', array(
99+
'encoding' => 'Windows-1252',
100+
));
101+
102+
$this->assertInstanceOf('\DOMDocument', $dom);
103+
$this->assertEmpty($this->html5->getErrors());
104+
$this->assertFalse($this->html5->hasErrors());
105+
106+
$dumpedAsUtf8 = mb_convert_encoding($dom->saveHTML(), 'UTF-8', 'Windows-1252');
107+
$this->assertNotFalse(mb_strpos($dumpedAsUtf8, 'Ž'));
108+
$this->assertNotFalse(mb_strpos($dumpedAsUtf8, 'è'));
109+
$this->assertNotFalse(mb_strpos($dumpedAsUtf8, 'ý'));
110+
$this->assertNotFalse(mb_strpos($dumpedAsUtf8, 'ù'));
111+
}
112+
79113
public function testErrors()
80114
{
81115
$dom = $this->html5->loadHTML('<xx as>');

0 commit comments

Comments
 (0)