Skip to content

Commit f192c84

Browse files
roukmoute and mheap
authored and committed
Use complete GSM7 character set + refactor unicode detection method (#137)
1 parent 2c39e4a commit f192c84

File tree

2 files changed

+28
-20
lines changed

2 files changed

+28
-20
lines changed

src/Message/EncodingDetector.php

Lines changed: 24 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -12,29 +12,26 @@ class EncodingDetector {
1212

1313
public function requiresUnicodeEncoding($content)
1414
{
15-
16-
$gsmCodePoints = [
17-
0x0040, 0x00A3, 0x0024, 0x00A5, 0x00E8, 0x00E9, 0x00F9, 0x00EC, 0x00F2, 0x00E7, 0x000A, 0x00D8, 0x00F8, 0x000D, 0x00C5, 0x00E5, 0x0394,
18-
0x005F, 0x03A6, 0x0393, 0x039B, 0x03A9, 0x03A0, 0x03A8, 0x03A3, 0x0398, 0x039E, 0x00A0, 0x000C, 0x005E, 0x007B, 0x007D, 0x005C, 0x005B,
19-
0x007E, 0x005D, 0x007C, 0x20AC, 0x00C6, 0x00E6, 0x00DF, 0x00C9, 0x0020, 0x0021, 0x0022, 0x0023, 0x00A4, 0x0025, 0x0026, 0x0027, 0x0028,
20-
0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039,
21-
0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, 0x00A1, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004A,
22-
0x004B, 0x004C, 0x004D, 0x004E, 0x004F, 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005A, 0x00C4,
23-
0x00D6, 0x00D1, 0x00DC, 0x00A7, 0x00BF, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006A, 0x006B, 0x006C,
24-
0x006D, 0x006E, 0x006F, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007A, 0x00E4, 0x00F6, 0x00F1,
25-
0x00FC, 0x00E0
26-
];
15+
$gsmCodePoints = array_map(
16+
$this->convertIntoUnicode(),
17+
[ // See: https://en.wikipedia.org/wiki/GSM_03.38#GSM_7-bit_default_alphabet_and_extension_table_of_3GPP_TS_23.038_/_GSM_03.38
18+
'@', '£', '$', '¥', 'è', 'é', 'ù', 'ì', 'ò', 'ç', "\r", 'Ø', 'ø', "\n", 'Å', 'å',
19+
'Δ', '_', 'Φ', 'Γ', 'Λ', 'Ω', 'Π', 'Ψ', 'Σ', 'Θ', 'Ξ', 'Æ', 'æ', 'ß', 'É',
20+
' ', '!', '"', '#', '¤', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
21+
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?',
22+
'¡', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
23+
'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Ö', 'Ñ', 'Ü', '§',
24+
'¿', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
25+
'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'ö', 'ñ', 'ü', 'à',
26+
"\f", '^', '{', '}', '\\', '[', '~', ']', '|', '€',
27+
]
28+
);
2729

2830
// Split $text into an array in a way that respects multibyte characters.
2931
$textChars = preg_split('//u', $content, null, PREG_SPLIT_NO_EMPTY);
3032

3133
// Array of codepoint values for characters in $text.
32-
$textCodePoints = array_map(function ($char) {
33-
$k = mb_convert_encoding($char, 'UTF-16LE', 'UTF-8');
34-
$k1 = ord(substr($k, 0, 1));
35-
$k2 = ord(substr($k, 1, 1));
36-
return $k2 * 256 + $k1;
37-
}, $textChars);
34+
$textCodePoints = array_map($this->convertIntoUnicode(), $textChars);
3835

3936
// Filter the array to contain only codepoints from $text that are not in the set of valid GSM codepoints.
4037
$nonGsmCodePoints = array_diff($textCodePoints, $gsmCodePoints);
@@ -43,6 +40,14 @@ public function requiresUnicodeEncoding($content)
4340
return !empty($nonGsmCodePoints);
4441
}
4542

43+
private function convertIntoUnicode()
44+
{
45+
return function ($char) {
46+
$k = mb_convert_encoding($char, 'UTF-16LE', 'UTF-8');
47+
$k1 = ord(substr($k, 0, 1));
48+
$k2 = ord(substr($k, 1, 1));
4649

50+
return $k2 * 256 + $k1;
51+
};
52+
}
4753
}
48-

test/Message/EncodingDetectorTest.php

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,8 @@ public function unicodeProvider() {
3131
$r['german'] = ['Heizölrückstoßabdämpfung', false];
3232
$r['greek'] = [' Γαζέες καὶ μυρτιὲς δὲν θὰ βρῶ πιὰ στὸ χρυσαφὶ ξέφωτο', true];
3333
$r['spanish'] = ['El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y frío, añoraba a su querido cachorro.', true];
34-
$r['french'] = ['Le cœur déçu mais l\'âme plutôt naïve, Louÿs rêva de crapaüter en canoë au delà des îles, près du mälström où brûlent les novæ.', true];
34+
$r['frenchWithUnicode'] = ['Le cœur déçu mais l\'âme plutôt naïve, Louÿs rêva de crapaüter en canoë au delà des îles, près du mälström où brûlent les novæ.', true];
35+
$r['frenchWithOnlyGSM'] = ['j\'étais donc plein de songes ! L\'espérance en chantant me berçait de mensonges. J\'étais donc cet enfant, hélas !', false];
3536
$r['icelandic'] = ['Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa ', true];
3637
$r['japanese-hiragana'] = ['いろはにほへとちりぬるを', true];
3738
$r['japanese-katakana'] = ['イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム', true];
@@ -40,6 +41,8 @@ public function unicodeProvider() {
4041
$r['russian'] = ['В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!', true];
4142
$r['thai'] = ['กว่าบรรดาฝูงสัตว์เดรัจฉาน', true];
4243
$r['turkish'] = ['Pijamalı hasta, yağız şoföre çabucak güvendi.', true];
44+
$r['LF'] = ["\n", false];
45+
$r['CR'] = ["\r", false];
4346

4447
return $r;
4548
}

0 commit comments

Comments
 (0)