1+ import fs from "fs" ;
2+ import OpenAI from "openai" ;
3+ import path from "path" ;
4+ import createAssistant from "../initializers/initialize" ;
5+ import dotenv from "dotenv" ;
6+ import sax from "sax" ;
7+ import { Readable } from "stream" ;
8+ import { fileURLToPath } from "url" ;
9+
// Load variables from a local .env file into process.env before reading them.
dotenv.config();

// Fail fast at module load when a mandatory setting is absent.
for (const requiredVar of ["AI_MODEL", "API_KEY"]) {
  if (process.env[requiredVar] === undefined) {
    throw Error("Please specify AI_MODEL and API_KEY!");
  }
}

// Shared OpenAI client; AI_BASEURL is optional and may be undefined.
const ai = new OpenAI({
  baseURL: process.env.AI_BASEURL,
  apiKey: process.env.API_KEY
});
21+
/**
 * Translates one XML file into `language` and writes the result to disk.
 *
 * The input is read from `../../xml` + `filePath` and the output is written to
 * `../../xml/translations` + `filePath` (both resolved relative to this
 * module). The document is split with a strict SAX parser: every direct child
 * of the root element becomes one segment that is sent to the assistant, while
 * the root tags and anything between segments are copied through unchanged.
 *
 * The returned promise settles only after the translated file has been
 * written, so callers can safely `await` it.
 *
 * @param language - Target language name inserted into the prompt.
 * @param filePath - Path of the XML file, appended to the `xml` directory.
 * @throws If the assistant thread cannot be created, or if the source file
 *   cannot be read or is not well-formed XML (nothing is written then).
 */
async function translate(language: string, filePath: string): Promise<void> {
  type SaxTag = { name: string; attributes: Record<string, string> };
  // [needsTranslation, xml]
  type Segment = [boolean, string];

  // sax hands over character data with entities already decoded, so it has to
  // be escaped again before being written back into XML.
  const escapeText = (text: string): string =>
    text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");

  // const assistant = await createAssistant(language, ai);
  const assistant_id = "asst_BLVYfog5DpWrbu3fW3o2oD4r";
  // One thread is shared by every chunk of this file.
  const thread = await ai.beta.threads.create();
  console.dir(thread);

  let segments: Segment[];
  try {
    const input_path = fileURLToPath(
      import.meta.resolve("../../xml" + filePath)
    );
    console.log(input_path);
    segments = await splitIntoSegments(input_path);
  } catch (parseErr) {
    console.error("Error parsing XML:", parseErr);
    throw parseErr;
  }

  // Chunks are translated one after another because they share a thread.
  let translated = "";
  for (const [needsTranslation, xml] of segments) {
    translated += needsTranslation ? await translateChunk(xml) : xml;
  }
  console.log(`Done translating all segments.`);

  const output_path = fileURLToPath(
    import.meta.resolve("../../xml/translations" + filePath)
  );

  // Ensure directory exists
  fs.mkdirSync(path.dirname(output_path), { recursive: true });
  fs.writeFileSync(output_path, translated);
  console.log(`Translation saved to ${output_path}`);

  /**
   * Streams the file through a strict SAX parser and splits it into segments.
   *
   * Depth 1 is the root element; each element opened at depth 2 starts a
   * segment that is recorded until its matching close tag.
   */
  function splitIntoSegments(inputPath: string): Promise<Segment[]> {
    return new Promise<Segment[]>((resolve, reject) => {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const parser = (sax as any).createStream(true, { trim: false });
      const source = fs.createReadStream(inputPath);

      const collected: Segment[] = [];
      let currentDepth = 0;
      let currentSegment = "";
      let isRecording = false;

      // Append to the open segment, or pass the markup through untranslated.
      const emit = (xml: string): void => {
        if (isRecording) {
          currentSegment += xml;
        } else {
          collected.push([false, xml]);
        }
      };

      parser.on("opentag", (node: SaxTag) => {
        currentDepth++;
        // If we're at depth 2, this is the start of a new segment.
        if (currentDepth === 2) {
          isRecording = true;
        }
        emit(`<${node.name}${formatAttributes(node.attributes)}>`);
      });

      parser.on("text", (text: string) => emit(escapeText(text)));

      parser.on("cdata", (cdata: string) => emit(`<![CDATA[${cdata}]]>`));

      // Comment bodies are written back verbatim.
      parser.on("comment", (comment: string) => emit(`<!--${comment}-->`));

      parser.on("closetag", (tagName: string) => {
        if (isRecording) {
          currentSegment += `</${tagName}>`;
        }

        if (currentDepth === 2) {
          // We are closing a segment element.
          collected.push([true, currentSegment]);
          currentSegment = "";
          isRecording = false;
        }

        if (currentDepth === 1) {
          // We are closing the root element.
          collected.push([false, `</${tagName}>`]);
        }

        currentDepth--;
      });

      const fail = (err: unknown): void => {
        source.destroy();
        reject(err);
      };

      // `on`, not `once`: the parser keeps going after a handled error, and a
      // later "error" event with no listener would throw.
      parser.on("error", fail);
      source.on("error", fail);
      parser.on("end", () => resolve(collected));

      source.pipe(parser);
    });
  }

  /**
   * Sends one segment to the assistant and returns the cleaned-up reply.
   *
   * Best effort: on any failure the error is logged and the untranslated
   * source chunk is returned, so the output file stays structurally complete.
   */
  async function translateChunk(chunk: string): Promise<string> {
    // console.log("translating chunk: " + chunk);
    const prompt = [
      `Translate this content to ${language}.`,
      "IMPORTANT: You MUST search the uploaded reference file for any technical terms and use EXACTLY the translations specified there.",
      "If a term exists in the reference file, use that translation without deviation.",
      "Do not modify XML tags, content of XML tags and structure. Do not say anything else. Only translate the content and return the xml as is.",
      "Content to translate:",
      `${chunk}`
    ].join("\n");

    try {
      await ai.beta.threads.messages.create(thread.id, {
        role: "user",
        content: prompt
      });
      const run = await ai.beta.threads.runs.createAndPoll(thread.id, {
        assistant_id: assistant_id
      });

      const messages = await ai.beta.threads.messages.list(thread.id, {
        run_id: run.id
      });
      const messageContent = messages.data.pop()?.content[0];

      if (messageContent === undefined) {
        throw new Error("Assistant returned no message content");
      }
      if (messageContent.type !== "text") {
        throw new Error(
          `Unexpected message content type: ${messageContent.type}`
        );
      }

      // console.log(messageContent.text.value);
      return await stripNonXml(escapeXML(messageContent.text.value));
    } catch (err) {
      console.log(
        `Error occured while translating ${filePath}:\n ` + String(err)
      );
      return chunk;
    }
  }

  /**
   * Removes anything the model wrote outside of XML elements (artifacts,
   * annotations etc.) by re-parsing the reply inside a synthetic <WRAPPER>
   * root and keeping only markup plus the text nested inside real elements.
   */
  function stripNonXml(response: string): Promise<string> {
    return new Promise<string>((resolve, reject) => {
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const clean = (sax as any).createStream(true, { trim: false });
      let cleaned = "";
      // -1 outside WRAPPER, 0 directly inside it, >= 1 inside a real element.
      let depth = -1;

      clean.on("opentag", (node: SaxTag) => {
        depth++;
        if (node.name !== "WRAPPER") {
          cleaned += `<${node.name}${formatAttributes(node.attributes)}>`;
        }
      });

      clean.on("text", (text: string) => {
        if (depth >= 1) {
          cleaned += escapeText(text);
        }
      });

      clean.on("closetag", (tagName: string) => {
        if (tagName !== "WRAPPER") {
          cleaned += `</${tagName}>`;
        }
        depth--;
      });

      clean.on("cdata", (cdata: string) => {
        cleaned += `<![CDATA[${cdata}]]>`;
      });

      clean.on("comment", (comment: string) => {
        cleaned += `<!--${comment}-->`;
      });

      clean.on("end", () => resolve(cleaned));
      // `on`, not `once`: malformed replies can raise several parse errors.
      clean.on("error", reject);

      Readable.from("<WRAPPER>" + response + "</WRAPPER>").pipe(clean);
    });
  }
}

export default translate;
217+
/**
 * Serializes an attribute map into the text that follows a tag name.
 *
 * Values are XML-escaped because a parser hands them over with entities
 * already decoded; writing them back raw would break on `&`, `<` or `"`.
 *
 * @param attrs - Attribute names mapped to their (decoded) values.
 * @returns A string such as ` id="x" lang="en"` with a leading space, or an
 *   empty string when there are no attributes.
 */
function formatAttributes(attrs: Record<string, unknown>): string {
  const attrStr = Object.entries(attrs)
    .map(([key, val]) => {
      const escaped = String(val)
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;");
      return `${key}="${escaped}"`;
    })
    .join(" ");
  return attrStr ? " " + attrStr : "";
}
225+
/**
 * Escapes bare ampersands so the text can be fed to a strict XML parser.
 *
 * An `&` that already starts one of the five predefined XML entities
 * (`&amp;`, `&lt;`, `&gt;`, `&apos;`, `&quot;`) or a numeric character
 * reference (`&#169;`, `&#xA9;`) is left alone; every other `&` becomes
 * `&amp;`.
 *
 * @param str - Text that may contain unescaped ampersands.
 * @returns The text with every stray `&` replaced by `&amp;`.
 */
function escapeXML(str: string): string {
  return str.replace(
    /&(?!(?:amp|lt|gt|apos|quot|#\d+|#x[0-9a-fA-F]+);)/g,
    "&amp;"
  );
}
0 commit comments