44import sys
55import warnings
66
7+ import requests
8+
79import openai
810from openai .upload_progress import BufferReader
911from openai .validators import (
@@ -200,7 +202,10 @@ def create(cls, args):
200202 with open (args .file , "rb" ) as file_reader :
201203 buffer_reader = BufferReader (file_reader .read (), desc = "Upload progress" )
202204 resp = openai .File .create (
203- file = buffer_reader , purpose = args .purpose , model = args .model
205+ file = buffer_reader ,
206+ purpose = args .purpose ,
207+ model = args .model ,
208+ user_provided_filename = args .file ,
204209 )
205210 print (resp )
206211
@@ -238,52 +243,102 @@ def list(cls, args):
238243 print (resp )
239244
240245 @classmethod
241- def _get_or_upload (cls , file , check_if_file_exists = True ):
242- try :
243- openai .File .retrieve (file )
244- except openai .error .InvalidRequestError as e :
245- if e .http_status == 404 and os .path .isfile (file ):
246- matching_files = openai .File .find_matching_files (
247- file = open (file ), purpose = "fine-tune"
246+ def _is_url (cls , file : str ):
247+ return file .lower ().startswith ("http" )
248+
249+ @classmethod
def _download_file_from_public_url(cls, url: str) -> Optional[bytes]:
    """Fetch `url` with an HTTP GET and return the response body.

    Returns the raw response bytes when the server answers with status 200,
    and None for any other status code. Connection problems and timeouts
    propagate as the exceptions raised by ``requests``.
    """
    # Without a timeout, requests waits forever on an unresponsive host.
    # The tuple is (connect timeout, read timeout) in seconds; the read
    # timeout bounds the gap between received bytes, not the total transfer,
    # so large files still download.
    resp = requests.get(url, timeout=(10, 60))
    if resp.status_code != 200:
        return None
    return resp.content
256+
257+ @classmethod
258+ def _maybe_upload_file (
259+ cls ,
260+ file : Optional [str ] = None ,
261+ content : Optional [bytes ] = None ,
262+ user_provided_file : Optional [str ] = None ,
263+ check_if_file_exists : bool = True ,
264+ ):
265+ # Exactly one of `file` or `content` must be provided
266+ if (file is None ) == (content is None ):
267+ raise ValueError ("Exactly one of `file` or `content` must be provided" )
268+
269+ if content is None :
270+ assert file is not None
271+ with open (file , "rb" ) as f :
272+ content = f .read ()
273+
274+ if check_if_file_exists :
275+ bytes = len (content )
276+ matching_files = openai .File .find_matching_files (
277+ name = user_provided_file or f .name , bytes = bytes , purpose = "fine-tune"
278+ )
279+ if len (matching_files ) > 0 :
280+ file_ids = [f ["id" ] for f in matching_files ]
281+ sys .stdout .write (
282+ "Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n " .format (
283+ name = os .path .basename (matching_files [0 ]["filename" ]),
284+ size = matching_files [0 ]["bytes" ],
285+ )
248286 )
249- if len ( matching_files ) > 0 and check_if_file_exists :
250- file_ids = [ f [ "id" ] for f in matching_files ]
287+ sys . stdout . write ( " \n " . join ( file_ids ))
288+ while True :
251289 sys .stdout .write (
252- "Found potentially duplicated files with name '{name}', purpose 'fine-tune' and size {size} bytes\n " .format (
253- name = matching_files [0 ]["filename" ],
254- size = matching_files [0 ]["bytes" ],
255- )
290+ "\n Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
256291 )
257- sys .stdout . write ( " \n " . join ( file_ids ) )
258- while True :
292+ inp = sys .stdin . readline (). strip ( )
293+ if inp in file_ids :
259294 sys .stdout .write (
260- "\n Enter file ID to reuse an already uploaded file, or an empty string to upload this file anyway: "
295+ "Reusing already uploaded file: {id} \n " . format ( id = inp )
261296 )
262- inp = sys .stdin .readline ().strip ()
263- if inp in file_ids :
264- sys .stdout .write (
265- "Using your file {file}: {id}\n " .format (
266- file = file , id = inp
267- )
268- )
269- return inp
270- elif inp == "" :
271- break
272- else :
273- sys .stdout .write (
274- "File id '{id}' is not among the IDs of the potentially duplicated files\n " .format (
275- id = inp
276- )
297+ return inp
298+ elif inp == "" :
299+ break
300+ else :
301+ sys .stdout .write (
302+ "File id '{id}' is not among the IDs of the potentially duplicated files\n " .format (
303+ id = inp
277304 )
305+ )
278306
279- resp = openai .File .create (
280- file = open (file ),
281- purpose = "fine-tune" ,
282- )
283- sys .stdout .write (
284- "Uploaded file from {file}: {id}\n " .format (file = file , id = resp ["id" ])
307+ buffer_reader = BufferReader (content , desc = "Upload progress" )
308+ resp = openai .File .create (
309+ file = buffer_reader ,
310+ purpose = "fine-tune" ,
311+ user_provided_filename = user_provided_file or file ,
312+ )
313+ sys .stdout .write (
314+ "Uploaded file from {file}: {id}\n " .format (
315+ file = user_provided_file or file , id = resp ["id" ]
316+ )
317+ )
318+ return resp ["id" ]
319+
320+ @classmethod
def _get_or_upload(cls, file, check_if_file_exists=True):
    """Resolve `file` to an uploaded file ID, uploading it when necessary.

    `file` is tried, in order, as: an ID the API can already retrieve, a
    path on the local filesystem, and a URL to download. If none of these
    apply, or the download yields nothing, `file` is returned unchanged.
    """
    try:
        # 1. An ID the API already knows about is used as is
        openai.File.retrieve(file)
    except openai.error.InvalidRequestError:
        # Not retrievable as an ID; try the other interpretations below
        pass
    else:
        return file

    # 2. A file on the local filesystem is uploaded
    if os.path.isfile(file):
        return cls._maybe_upload_file(
            file=file, check_if_file_exists=check_if_file_exists
        )

    # 3. A URL is downloaded and its content uploaded
    if not cls._is_url(file):
        return file
    downloaded = cls._download_file_from_public_url(file)
    if downloaded is None:
        return file
    return cls._maybe_upload_file(
        content=downloaded,
        check_if_file_exists=check_if_file_exists,
        user_provided_file=file,
    )
288343
289344 @classmethod
@@ -737,15 +792,15 @@ def help(args):
737792 "--training_file" ,
738793 required = True ,
739794 help = "JSONL file containing prompt-completion examples for training. This can "
740- "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
741- "or a local file path." ,
795+ "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
796+ ' a local file path, or a URL that starts with "http".' ,
742797 )
743798 sub .add_argument (
744799 "-v" ,
745800 "--validation_file" ,
746801 help = "JSONL file containing prompt-completion examples for validation. This can "
747- "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345) "
748- "or a local file path." ,
802+ "be the ID of a file uploaded through the OpenAI API (e.g. file-abcde12345), "
803+ ' a local file path, or a URL that starts with "http".' ,
749804 )
750805 sub .add_argument (
751806 "--no_check_if_files_exist" ,
@@ -780,7 +835,7 @@ def help(args):
780835 type = float ,
781836 help = "The learning rate multiplier to use for training. The fine-tuning "
782837 "learning rate is determined by the original learning rate used for "
783- "pretraining multiplied by this value" ,
838+ "pretraining multiplied by this value. " ,
784839 )
785840 sub .add_argument (
786841 "--use_packing" ,
@@ -796,15 +851,15 @@ def help(args):
796851 "--no_packing" ,
797852 action = "store_false" ,
798853 dest = "use_packing" ,
799- help = "Disables the packing flag (see --use_packing for description)" ,
854+ help = "Disables the packing flag (see --use_packing for description). " ,
800855 )
801856 sub .set_defaults (use_packing = None )
802857 sub .add_argument (
803858 "--prompt_loss_weight" ,
804859 type = float ,
805860 help = "The weight to use for the prompt loss. The optimum value here depends "
806861 "depends on your use case. This determines how much the model prioritizes "
807- "learning from prompt tokens vs learning from completion tokens" ,
862+ "learning from prompt tokens vs learning from completion tokens. " ,
808863 )
809864 sub .add_argument (
810865 "--compute_classification_metrics" ,
@@ -817,13 +872,13 @@ def help(args):
817872 "--classification_n_classes" ,
818873 type = int ,
819874 help = "The number of classes in a classification task. This parameter is "
820- "required for multiclass classification" ,
875+ "required for multiclass classification. " ,
821876 )
822877 sub .add_argument (
823878 "--classification_positive_class" ,
824879 help = "The positive class in binary classification. This parameter is needed "
825880 "to generate precision, recall and F-1 metrics when doing binary "
826- "classification" ,
881+ "classification. " ,
827882 )
828883 sub .add_argument (
829884 "--classification_betas" ,
0 commit comments