Skip to content

Commit 4ab63fc

Browse files
committed
Preparations for next version release
1 parent bbdb9b9 commit 4ab63fc

File tree

4 files changed

+82
-32
lines changed

4 files changed

+82
-32
lines changed

README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ Now that you've loaded the module, you can [start using it](#usage-1).
103103

104104
##### Usage
105105

106-
The `<script>` tag exposes the `languageEncoding` function to everything in the DOM located beneath it. When you call it and pass in the file that you want to analyze, it'll return a Promise that you can use to retrieve the encoding, language and confidenc score as shown in the example below.
106+
The `<script>` tag exposes the `languageEncoding` function to everything in the DOM located beneath it. When you call it and pass in the file that you want to analyze, it'll return a Promise that you can use to retrieve the encoding, language and confidence score as shown in the example below.
107107

108108
```js
109109
// app.js
@@ -116,7 +116,7 @@ function inputHandler(e) {
116116
const file = e.target.files[0];
117117

118118
languageEncoding(file).then((fileInfo) => console.log(fileInfo));
119-
// Possible result: { language: english, encoding: UTF-8, confidence: 0.97}
119+
// Possible result: { language: english, encoding: UTF-8, confidence: { encoding: 1, language: 1 } }
120120
}
121121
```
122122

@@ -143,7 +143,7 @@ function inputHandler(e) {
143143
const file = e.target.files[0];
144144

145145
languageEncoding(file).then((fileInfo) => console.log(fileInfo));
146-
// Possible result: { language: english, encoding: UTF-8, confidence: 0.97}
146+
// Possible result: { language: french, encoding: CP1252, confidence: { encoding: 1, language: 0.97 } }
147147
}
148148
```
149149

@@ -167,7 +167,7 @@ const languageEncoding = require("detect-file-encoding-and-language");
167167
const pathToFile = "/home/username/documents/my-text-file.txt";
168168

169169
languageEncoding(pathToFile).then((fileInfo) => console.log(fileInfo));
170-
// Possible result: { language: japanese, encoding: Shift-JIS, confidence: 1 }
170+
// Possible result: { language: japanese, encoding: Shift-JIS, confidence: { encoding: 0.94, language: 0.94 } }
171171
```
172172

173173
### In the terminal (CLI)
@@ -184,14 +184,14 @@ Once installed you'll be able to use the command `dfeal` to retrieve the encodin
184184

185185
```bash
186186
$ dfeal "/home/user name/Documents/subtitle file.srt"
187-
# Possible result: { language: french, encoding: CP1252, confidence: 0.99 }
187+
# Possible result: { language: french, encoding: CP1252, confidence: { encoding: 0.99, language: 0.99 } }
188188
```
189189

190190
or without quotation marks, using backslashes to escape spaces:
191191

192192
```bash
193193
$ dfeal /home/user\ name/Documents/subtitle\ file.srt
194-
# Possible result: { language: french, encoding: CP1252, confidence: 0.99 }
194+
# Possible result: { language: french, encoding: CP1252, confidence: { encoding: 0.97, language: 0.97 } }
195195
```
196196

197197
## Supported Languages
@@ -256,7 +256,7 @@ $ dfeal /home/user\ name/Documents/subtitle\ file.srt
256256

257257
## Confidence Score
258258

259-
The confidence score ranges from 0 to 1. It is based on the amount of matches that were found for a particular language and the frequency of those matches. If you want to learn more about how it all works, check out the [Wiki entry](https://github.com/gignupg/Detect-File-Encoding-and-Language/wiki)!
259+
The confidence score is an object that contains two separate scores, each ranging from 0 to 1: the language confidence score and the encoding confidence score. Both scores will be the same if the detected encoding is Unicode. Otherwise, the confidence scores for the language and the encoding are calculated separately. The score is based on the number of matches that were found for a particular language and the frequency of those matches. If you want to learn more about how it all works, check out the [Wiki entry](https://github.com/gignupg/Detect-File-Encoding-and-Language/wiki)!
260260

261261
## Known Issues
262262

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
{
22
"name": "detect-file-encoding-and-language",
3-
"version": "1.7.4",
3+
"version": "2.0.1",
44
"description": "Charset Detector - Detect the encoding and language of any file - Use it in the browser, with Node.js, or via CLI",
55
"main": "src/index-node.js",
66
"scripts": {
77
"regextest": "node ./testing/regexTester.test.js",
88
"test": "node ./testing/language-encoding.test.js",
9+
"build": "browserify --standalone ./src/index-browser.js > ./umd/language-encoding.min.js",
910
"prepublishOnly": "npm test"
1011
},
1112
"browser": "umd/language-encoding.min.js",

src/index-browser.js

Lines changed: 73 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,85 @@
1-
const checkUTF = require('./components/checkUTF.js');
2-
const processContent = require('./components/processContent.js');
1+
const checkUTF = require("./components/checkUTF.js");
2+
const processContent = require("./components/processContent.js");
3+
const checkByteOrderMark = require("./components/checkByteOrderMark.js");
34

4-
module.exports = (file, test) => {
5-
return new Promise((resolve, reject) => {
6-
const data = {};
7-
const utfReader = new FileReader();
5+
module.exports = (file) => {
6+
return new Promise((resolve, reject) => {
7+
const fileInfo = {
8+
encoding: null,
9+
language: null,
10+
confidence: {
11+
encoding: null,
12+
language: null,
13+
},
14+
};
15+
const data = {};
816

9-
utfReader.onerror = (err) => {
10-
reject(err);
17+
// Check the byte order mark!
18+
const byteOrderMarkBuffer = new FileReader();
19+
20+
byteOrderMarkBuffer.onload = () => {
21+
const uInt8String = new Uint8Array(byteOrderMarkBuffer.result)
22+
.slice(0, 4)
23+
.join(" ");
24+
const byteOrderMark = checkByteOrderMark(uInt8String);
25+
26+
if (byteOrderMark) {
27+
fileInfo.encoding = byteOrderMark;
28+
fileInfo.confidence.encoding = 1;
29+
30+
const byteOrderMarkReader = new FileReader();
31+
32+
byteOrderMarkReader.onload = () => {
33+
data.content = byteOrderMarkReader.result;
34+
resolve(processContent(data, fileInfo));
35+
};
36+
37+
byteOrderMarkReader.onerror = (err) => {
38+
reject(err);
1139
};
1240

41+
byteOrderMarkReader.readAsText(file, fileInfo.encoding);
42+
} else {
43+
// Read with UTF-8 first, then with ISO-8859-1
44+
const utfReader = new FileReader();
45+
1346
utfReader.onload = () => {
14-
const utfContent = utfReader.result;
47+
const utfContent = utfReader.result;
1548

16-
data.utf8 = checkUTF(utfContent);
49+
const utf8 = checkUTF(utfContent);
1750

18-
if (data.utf8) {
19-
data.content = utfContent;
20-
resolve(processContent(data));
51+
if (utf8) {
52+
fileInfo.encoding = "UTF-8";
53+
fileInfo.confidence.encoding = 1;
54+
}
2155

22-
} else {
23-
const isoReader = new FileReader();
56+
if (utf8) {
57+
data.content = utfContent;
58+
resolve(processContent(data, fileInfo));
59+
} else {
60+
const isoReader = new FileReader();
2461

25-
isoReader.onload = () => {
26-
data.content = isoReader.result;
27-
resolve(processContent(data));
28-
};
62+
isoReader.onload = () => {
63+
data.content = isoReader.result;
64+
resolve(processContent(data, fileInfo));
65+
};
66+
67+
isoReader.readAsText(file, "ISO-8859-1");
68+
}
69+
};
2970

30-
isoReader.readAsText(file, "ISO-8859-1");
31-
}
71+
utfReader.onerror = (err) => {
72+
reject(err);
3273
};
74+
3375
utfReader.readAsText(file, "UTF-8");
34-
});
35-
};
76+
}
77+
};
78+
79+
byteOrderMarkBuffer.onerror = (err) => {
80+
reject(err);
81+
};
82+
83+
byteOrderMarkBuffer.readAsArrayBuffer(file);
84+
});
85+
};

src/index-node.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ module.exports = (file, test) => {
1919
// Reading the first four bytes and checking if they coincide with one of the predefined byte order marks.
2020
const readStream = fs.createReadStream(file, { start: 0, end: 3 });
2121

22-
// This will wait until we know the readable stream is actually valid before piping
2322
readStream.on("data", function (buffer) {
2423
const uInt8Array = new Uint8Array(buffer);
2524
const uInt8String = uInt8Array.join(" ");

0 commit comments

Comments
 (0)