@@ -19,15 +19,15 @@ const ai = new OpenAI({
1919 baseURL : process . env . AI_BASEURL
2020} ) ;
2121
22- const MAXLEN = 2000 ;
22+ const MAXLEN = 5000 ;
2323
2424async function translate ( language : string , filePath : string ) : Promise < void > {
2525 try {
2626 // Pipe the XML file into the parser.
2727 const input_dir = fileURLToPath (
2828 import . meta. resolve ( "../../xml" + filePath )
2929 ) ;
30- console . log ( input_dir ) ;
30+ console . log ( "Translating file: " + input_dir ) ;
3131 const translated : string = await recursivelyTranslate ( language , input_dir ) ;
3232
3333 const output_path = fileURLToPath (
@@ -52,10 +52,14 @@ async function recursivelyTranslate(
5252) : Promise < string > {
5353 // Recursive function to split and translate
5454 async function helper ( ori : string , force : boolean ) : Promise < string > {
55+ ori = escapeXML ( ori ) ;
56+
5557 if ( ori . length < MAXLEN && ! force ) {
58+ console . log ( "Translating chunk: " + ori . substring ( 0 , 50 ) + "..." ) ;
5659 return await translateChunk ( ori ) ; // translate the chunk
5760 }
5861
62+ console . log ( "Chunk too large, splitting..." ) ;
5963 let subTranslated = "" ;
6064 // continue splitting the chunk
6165 // Create a SAX parser in strict mode to split source into chunks.
@@ -86,7 +90,11 @@ async function recursivelyTranslate(
8690 if ( subIsRecording ) {
8791 subCurrentSegment += `${ text } ` ;
8892 } else {
89- subSegments . push ( [ false , text ] ) ;
93+ if ( text == "\n " || text == "\r\n " || text == ", \n" || text == ", \r\n" ) {
94+ subSegments . push ( [ false , text ] ) ;
95+ } else {
96+ subSegments . push ( [ true , text ] ) ;
97+ }
9098 }
9199 } ) ;
92100
@@ -132,7 +140,7 @@ async function recursivelyTranslate(
132140 subTranslated += segment [ 1 ] ;
133141 }
134142 }
135- console . log ( `Done translating all segments .` ) ;
143+ console . log ( `Completed chunk translation, continuing.. .` ) ;
136144 resolve ( ) ;
137145 } ) ;
138146
@@ -232,7 +240,7 @@ async function recursivelyTranslate(
232240 }
233241 }
234242 console . log ( `Done translating all segments.` ) ;
235- resolve ( )
243+ resolve ( ) ;
236244 } ) ;
237245
238246 parser . on ( "error" , reject ) ;
@@ -247,69 +255,17 @@ async function recursivelyTranslate(
247255 }
248256
249257 async function translateChunk ( chunk : string ) : Promise < string > {
250- // console.log("translating chunk: " + chunk);
251- // Create a SAX parser in strict mode for cleaning up translations.
252- const clean = ( sax as any ) . createStream ( true , { trim : false } ) ;
253-
254- // SAX parser to remove any excess text (artifacts, annotations etc.) from LLM outside of XML tags
255- let currDepth = - 1 ;
256-
257- clean . on ( "text" , text => {
258- if ( currDepth >= 1 ) {
259- translatedChunk += escapeXML ( text ) ;
260- }
261- } ) ;
262-
263- clean . on ( "opentag" , node => {
264- currDepth ++ ;
265- if ( node . name != "WRAPPER" ) {
266- translatedChunk += `<${ node . name } ${ formatAttributes ( node . attributes ) } >` ;
267- }
268- } ) ;
269-
270- clean . on ( "closetag" , tagName => {
271- if ( tagName != "WRAPPER" ) {
272- translatedChunk += `</${ tagName } >` ;
273- }
274- currDepth -- ;
275- } ) ;
276-
277- clean . on ( "cdata" , cdata => {
278- translatedChunk += `<![CDATA[${ cdata } ]]>` ;
279- } ) ;
280-
281- clean . on ( "comment" , comment => {
282- translatedChunk += `<!-- ${ comment } -->` ;
283- } ) ;
284-
285- clean . on ( "error" , error => {
286- console . log (
287- "error encountered when validating XML: " +
288- error +
289- "\nvalidating section: " +
290- chunk . substring ( 0 , 100 ) +
291- "..."
292- ) ;
293-
294- // Attempt to recover using the internal parser
295- try {
296- clean . _parser . resume ( ) ;
297- } catch ( e ) {
298- console . log ( "Failed to resume parser:" , e ) ;
299- }
300- } ) ;
301-
302258 let translatedChunk = "" ;
303259
304260 try {
305261 await ai . beta . threads . messages . create ( thread . id , {
306262 role : "user" ,
307263 content : `Translate this content to ${ language } .
308- IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
309- If a term exists in the reference file, use that translation without deviation.
310- Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
311- Content to translate:
312- ${ chunk } `
264+ IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.
265+ If a term exists in the reference file, use that translation without deviation.
266+ Do not modify XML tags, attributes of XML tags and structure. Do not say anything else.
267+ Content to translate:
268+ ${ chunk } `
313269 } ) ;
314270 const run = await ai . beta . threads . runs . createAndPoll ( thread . id , {
315271 assistant_id : assistant_id
@@ -328,14 +284,65 @@ async function recursivelyTranslate(
328284 }
329285
330286 const text = messageContent . text ;
331- // console.log(text.value);
332287
333288 const safeText = escapeXML ( text . value ) ;
334289 const textStream = Readable . from ( "<WRAPPER>" + safeText + "</WRAPPER>" ) ;
335290
// Run the wrapped model response through a strict SAX stream and rebuild it
// into `translatedChunk`, dropping any text the model emitted outside of the
// XML elements. Resolves when the stream ends; rejects only when the parser
// cannot be resumed after a parse error.
await new Promise<void>((resolve, reject) => {
  // Create a SAX parser in strict mode for cleaning up translations.
  const clean = (sax as any).createStream(true, { trim: false });

  // Depth of the element currently open: -1 before <WRAPPER> opens, 0 directly
  // inside <WRAPPER>, >= 1 inside a real element of the translated content.
  let currDepth = -1;

  // Keep text only when it sits inside a real element; text directly under
  // <WRAPPER> is treated as model chatter and discarded.
  clean.on("text", (segment: string) => {
    if (currDepth >= 1) {
      translatedChunk += escapeXML(segment);
    }
  });

  clean.on("opentag", node => {
    currDepth++;
    if (node.name !== "WRAPPER") {
      translatedChunk += `<${node.name}${formatAttributes(node.attributes)}>`;
    }
  });

  clean.on("closetag", (tagName: string) => {
    if (tagName !== "WRAPPER") {
      translatedChunk += `</${tagName}>`;
    }
    currDepth--;
  });

  clean.on("cdata", (cdata: string) => {
    translatedChunk += `<![CDATA[${cdata}]]>`;
  });

  clean.on("comment", (comment: string) => {
    translatedChunk += `<!-- ${comment} -->`;
  });

  clean.on("error", error => {
    console.log(
      "error encountered when validating XML: " +
        error +
        "\nvalidating section: " +
        chunk.substring(0, 100) +
        "..."
    );

    // Attempt to recover using the internal parser so that one malformed
    // spot does not discard the whole chunk.
    try {
      clean._parser.error = null;
      clean._parser.resume();
    } catch (e) {
      console.log("Failed to resume parser:", e);
      // Recovery failed: settle the promise so the caller's await cannot
      // hang. (A bare `reject;` expression never invoked the function.)
      reject(e);
    }
  });

  clean.once("end", resolve);

  textStream.pipe(clean);
});
341348
0 commit comments