@@ -6,6 +6,7 @@ import dotenv from "dotenv";
66import sax from "sax" ;
77import { Readable } from "stream" ;
88import { fileURLToPath } from "url" ;
9+ import { strict } from "assert" ;
910
1011dotenv . config ( ) ;
1112
@@ -21,13 +22,15 @@ const ai = new OpenAI({
2122
2223const MAXLEN = 5000 ;
2324
/**
 * Creates a strict-mode SAX stream parser shared by every parsing pass in this file.
 *
 * All parser options must live in the single options object passed as the second
 * argument: `sax.createStream(strict, options)` takes only two parameters, so an
 * options object supplied as a third argument is silently ignored.
 *
 * - `trim: false` keeps whitespace in text nodes exactly as it appears in the input.
 * - `strictEntities: true` limits entity parsing to the five predefined XML entities.
 */
const createParser = () =>
  // `sax` is cast to `any` here, as elsewhere in this file, to bypass its typings.
  (sax as any).createStream(true, { trim: false, strictEntities: true });
26+
2427async function translate ( language : string , filePath : string ) : Promise < void > {
2528 try {
2629 // Pipe the XML file into the parser.
2730 const input_dir = fileURLToPath (
2831 import . meta. resolve ( "../../xml" + filePath )
2932 ) ;
30- console . log ( "Translating file: " + input_dir ) ;
33+
3134 const translated : string = await recursivelyTranslate ( language , input_dir ) ;
3235
3336 const output_path = fileURLToPath (
@@ -52,19 +55,15 @@ async function recursivelyTranslate(
5255) : Promise < string > {
5356 // Recursive function to split and translate
5457 async function helper ( ori : string , force : boolean ) : Promise < string > {
55- ori = escapeXML ( ori ) ;
56-
5758 if ( ori . length < MAXLEN && ! force ) {
58- console . log ( "Translating chunk: " + ori . substring ( 0 , 50 ) + "..." ) ;
5959 return await translateChunk ( ori ) ; // translate the chunk
6060 }
6161
62- console . log ( "Chunk too large, splitting..." ) ;
6362 let subTranslated = "" ;
6463 // continue splitting the chunk
6564 // Create a SAX parser in strict mode to split source into chunks.
6665 await new Promise < void > ( ( resolve , reject ) => {
67- const subParser = ( sax as any ) . createStream ( true , { trim : false } ) ;
66+ const subParser = createParser ( ) ;
6867
6968 let subCurrentDepth = 0 ;
7069 let subCurrentSegment = "" ;
@@ -87,12 +86,22 @@ async function recursivelyTranslate(
8786 } ) ;
8887
8988 subParser . on ( "text" , text => {
89+ text = strongEscapeXML ( text ) ;
9090 if ( subIsRecording ) {
91- subCurrentSegment += ` ${ text } ` ;
91+ subCurrentSegment += text ;
9292 } else {
93- if ( subSegments . length > 0 && subSegments [ subSegments . length - 1 ] [ 1 ] != undefined ) {
93+ if (
94+ subSegments . length > 0 &&
95+ subSegments [ subSegments . length - 1 ] [ 1 ] != undefined
96+ ) {
9497 subSegments [ subSegments . length - 1 ] [ 1 ] += text ;
9598 subSegments [ subSegments . length - 1 ] [ 0 ] = true ;
99+
100+ // if (text == "\n " || text == "\r\n " || text == ", \n" || text == ", \r\n") {
101+ // subSegments.push([false, text]);
102+ // } else {
103+ // subSegments.push([true, text]);
104+ // }
96105 } else {
97106 subSegments . push ( [ true , text ] ) ;
98107 }
@@ -141,7 +150,6 @@ async function recursivelyTranslate(
141150 subTranslated += segment [ 1 ] ;
142151 }
143152 }
144- console . log ( `Completed chunk translation, continuing...` ) ;
145153 resolve ( ) ;
146154 } ) ;
147155
@@ -154,7 +162,7 @@ async function recursivelyTranslate(
154162 }
155163
156164 // Create a SAX parser in strict mode to split source into chunks.
157- const parser = ( sax as any ) . createStream ( true , { trim : false } ) ;
165+ const parser = createParser ( ) ;
158166
159167 // const assistant = await createAssistant(language, ai);
160168 const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r" ;
@@ -191,8 +199,9 @@ async function recursivelyTranslate(
191199 } ) ;
192200
193201 parser . on ( "text" , text => {
202+ text = strongEscapeXML ( text ) ;
194203 if ( isRecording ) {
195- currentSegment += ` ${ text } ` ;
204+ currentSegment += text ;
196205 } else {
197206 segments . push ( [ false , text ] ) ;
198207 }
@@ -287,18 +296,19 @@ async function recursivelyTranslate(
287296 const text = messageContent . text ;
288297
289298 const safeText = escapeXML ( text . value ) ;
299+ console . log ( safeText ) ;
290300 const textStream = Readable . from ( "<WRAPPER>" + safeText + "</WRAPPER>" ) ;
291301
292302 await new Promise < void > ( ( resolve , reject ) => {
293303 // Create a SAX parser in strict mode for cleaning up translations.
294- const clean = ( sax as any ) . createStream ( true , { trim : false } ) ;
304+ const clean = createParser ( ) ;
295305
296306 // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
297307 let currDepth = - 1 ;
298308
299309 clean . on ( "text" , text => {
300310 if ( currDepth >= 1 ) {
301- translatedChunk += escapeXML ( text ) ;
311+ translatedChunk += strongEscapeXML ( text ) ;
302312 }
303313 } ) ;
304314
@@ -368,3 +378,12 @@ function formatAttributes(attrs) {
/**
 * Escapes bare ampersands so the string can be embedded in XML.
 *
 * An `&` that already begins one of the five predefined XML entity references
 * (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`) is left untouched, which avoids
 * double-escaping those references. Every other `&` is rewritten to `&amp;`.
 * Angle brackets and quotes are NOT touched, so markup in the input stays markup.
 *
 * @param str - Text that may contain unescaped ampersands.
 * @returns The text with every bare ampersand replaced by `&amp;`.
 */
function escapeXML(str: string): string {
  // Negative lookahead: skip ampersands that already start a predefined entity.
  return str.replace(/&(?!(?:amp;|lt;|gt;|apos;|quot;))/g, "&amp;");
}
381+
/**
 * Escapes all five XML-special characters so the string is safe as XML text.
 *
 * Unlike an ampersand-only escape, this also rewrites `<`, `>`, `"` and `'`,
 * so the result can never be parsed as markup. An `&` that already begins a
 * predefined entity reference (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`) is
 * kept as is, to avoid double-escaping.
 *
 * @param str - Raw text content.
 * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by entity references.
 */
function strongEscapeXML(str: string): string {
  return (
    str
      // Ampersands go first, so the entities produced below are not re-escaped.
      .replace(/&(?!(?:amp;|lt;|gt;|apos;|quot;))/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;")
      .replace(/'/g, "&apos;")
  );
}
0 commit comments