1212import argparse
1313import platform
1414import Xlib .display
15+ import Xlib .X
16+ import Xlib .Xutil # not sure if Xutil is necessary
1517
1618from prompt_toolkit import prompt
1719from prompt_toolkit .shortcuts import message_dialog
3133client .api_key = os .getenv ("OPENAI_API_KEY" )
3234client .base_url = os .getenv ("OPENAI_API_BASE_URL" , client .base_url )
3335
# Fallback screen dimensions used to convert percentage coordinates into
# pixels. On Linux these are overwritten with the real X display size when
# capture_screen_with_cursor() runs; on other platforms the defaults remain.
monitor_size = {
    "width": 1920,
    "height": 1080,
}
40+
3441VISION_PROMPT = """
3542You are a Self-Operating Computer. You use the same operating system as a human.
3643
4653Here are the response formats below.
4754
48551. CLICK
49- Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
56+ Response: CLICK {{ "x": "percent", "y": "percent", "description": "~description here~", "reason": "~reason here~" }}
57+ Note that the percents work where the top left corner is "x": "0%" and "y": "0%" and the bottom right corner is "x": "100%" and "y": "100%"
5058
51592. TYPE
5260Response: TYPE "value you want to type"
8896Objective: {objective}
8997"""
9098
# Side length, in pixels, of the square "mini" screenshot captured around the
# previous click location when accurate mode re-prompts the model.
ACCURATE_PIXEL_COUNT = 200  # mini_screenshot is ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT big

# Follow-up prompt used by accurate mode: it shows the model a zoomed-in
# screenshot centered on its previous click guess ({prev_x}, {prev_y}) so it
# can refine the percentage coordinates. {width}/{height} are the mini
# screenshot's half-extent expressed as a percentage of the full monitor
# (see format_accurate_mode_vision_prompt).
ACCURATE_MODE_VISION_PROMPT = """
It looks like your previous attempted action was clicking on "x": {prev_x}, "y": {prev_y}. This has now been moved to the center of this screenshot.
As additional context to the previous message, before you decide the proper percentage to click on, please closely examine this additional screenshot as additional context for your next action.
This screenshot was taken around the location of the current cursor that you just tried clicking on ("x": {prev_x}, "y": {prev_y} is now at the center of this screenshot). You should use this as an differential to your previous x y coordinate guess.

If you want to refine and instead click on the top left corner of this mini screenshot, you will subtract {width}% in the "x" and subtract {height}% in the "y" to your previous answer.
Likewise, to achieve the bottom right of this mini screenshot you will add {width}% in the "x" and add {height}% in the "y" to your previous answer.

There are four segmenting lines across each dimension, divided evenly. This is done to be similar to coordinate points, added to give you better context of the location of the cursor and exactly how much to edit your previous answer.

Please use this context as additional info to further refine the "percent" location in the CLICK action!
"""
91112
92113USER_QUESTION = "Hello, I can help you with anything. What would you like done?"
93114
@@ -171,7 +192,7 @@ def supports_ansi():
171192 ANSI_BRIGHT_MAGENTA = ""
172193
173194
174- def main (model ):
195+ def main (model , accurate_mode ):
175196 """
176197 Main function for the Self-Operating Computer
177198 """
@@ -209,7 +230,7 @@ def main(model):
209230 if DEBUG :
210231 print ("[loop] messages before next action:\n \n \n " , messages [1 :])
211232 try :
212- response = get_next_action (model , messages , objective )
233+ response = get_next_action (model , messages , objective , accurate_mode )
213234 action = parse_oai_response (response )
214235 action_type = action .get ("type" )
215236 action_detail = action .get ("data" )
@@ -291,9 +312,19 @@ def format_vision_prompt(objective, previous_action):
291312 return prompt
292313
293314
294- def get_next_action (model , messages , objective ):
def format_accurate_mode_vision_prompt(prev_x, prev_y):
    """
    Build the accurate-mode follow-up prompt for a previous click guess.

    The mini screenshot's half-extent (ACCURATE_PIXEL_COUNT / 2) is expressed
    as a percentage of the monitor's width/height so the model can adjust its
    percentage-based coordinates.
    """
    half_extent = ACCURATE_PIXEL_COUNT / 2
    width_pct = 100 * half_extent / monitor_size["width"]
    height_pct = 100 * half_extent / monitor_size["height"]
    return ACCURATE_MODE_VISION_PROMPT.format(
        prev_x=prev_x,
        prev_y=prev_y,
        width=width_pct,
        height=height_pct,
    )
323+
324+
325+ def get_next_action (model , messages , objective , accurate_mode ):
295326 if model == "gpt-4-vision-preview" :
296- content = get_next_action_from_openai (messages , objective )
327+ content = get_next_action_from_openai (messages , objective , accurate_mode )
297328 return content
298329 elif model == "agent-1" :
299330 return "coming soon"
@@ -314,8 +345,56 @@ def get_last_assistant_message(messages):
314345 return messages [index ]
315346 return None # Return None if no assistant message is found
316347
def accurate_mode_double_check(pseudo_messages, prev_x, prev_y):
    """
    Re-prompt the model with a mini screenshot centered on the previous click
    so it can fine-tune the clicked location.

    Appends the follow-up message (prompt text + grid-overlaid mini
    screenshot) to pseudo_messages, then asks the vision model again.
    Returns the model's reply, or the sentinel string "ERROR" on any failure.
    """
    try:
        # Capture a small screenshot around the model's previous guess.
        mini_path = os.path.join("screenshots", "screenshot_mini.png")
        capture_mini_screenshot_with_cursor(
            file_path=mini_path, x=prev_x, y=prev_y
        )

        # The grid-overlaid variant is what we actually show the model.
        grid_path = os.path.join("screenshots", "screenshot_mini_with_grid.png")
        with open(grid_path, "rb") as img_file:
            encoded_image = base64.b64encode(img_file.read()).decode("utf-8")

        pseudo_messages.append(
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": format_accurate_mode_vision_prompt(prev_x, prev_y),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encoded_image}"
                        },
                    },
                ],
            }
        )

        response = client.chat.completions.create(
            model="gpt-4-vision-preview",
            messages=pseudo_messages,
            presence_penalty=1,
            frequency_penalty=1,
            temperature=0.7,
            max_tokens=300,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Error reprompting model for accurate_mode: {e}")
        return "ERROR"
395+
396+
397+ def get_next_action_from_openai (messages , objective , accurate_mode ):
319398 """
320399 Get the next action for Self-Operating Computer
321400 """
@@ -355,6 +434,7 @@ def get_next_action_from_openai(messages, objective):
355434 },
356435 ],
357436 }
437+
358438 # create a copy of messages and save to pseudo_messages
359439 pseudo_messages = messages .copy ()
360440 pseudo_messages .append (vision_message )
@@ -374,7 +454,23 @@ def get_next_action_from_openai(messages, objective):
374454 "content" : "`screenshot.png`" ,
375455 }
376456 )
457+
377458 content = response .choices [0 ].message .content
459+
460+ if accurate_mode :
461+ if content .startswith ("CLICK" ):
462+ # Adjust pseudo_messages to include the accurate_mode_message
463+
464+ click_data = re .search (r"CLICK \{ (.+) \}" , content ).group (1 )
465+ click_data_json = json .loads (f"{{{ click_data } }}" )
466+ prev_x = click_data_json ["x" ]
467+ prev_y = click_data_json ["y" ]
468+
469+ if DEBUG :
470+ print (f"Previous coords before accurate tuning: prev_x { prev_x } prev_y { prev_y } " )
471+ content = accurate_mode_double_check (pseudo_messages , prev_x , prev_y )
472+ assert content != "ERROR" , "ERROR: accurate_mode_double_check failed"
473+
378474 return content
379475
380476 except Exception as e :
@@ -445,7 +541,6 @@ def summarize(messages, objective):
445541 print (f"Error parsing JSON: { e } " )
446542 return "Failed to summarize the workflow"
447543
448-
449544def mouse_click (click_detail ):
450545 try :
451546 x = convert_percent_to_decimal (click_detail ["x" ])
@@ -575,7 +670,51 @@ def search(text):
575670 return "Open program: " + text
576671
577672
def capture_mini_screenshot_with_cursor(
    file_path=os.path.join("screenshots", "screenshot_mini.png"), x=0, y=0
):
    """
    Capture an ACCURATE_PIXEL_COUNT x ACCURATE_PIXEL_COUNT screenshot centered
    on (x, y) and write a grid-overlaid copy for the accurate-mode re-prompt.

    Args:
        file_path: Destination path for the raw mini screenshot.
        x, y: Click position, either as percentage strings (e.g. "50%") as
            produced by the model, or as bare numbers in [0, 100].

    Side effects:
        Saves `file_path` and `screenshots/screenshot_mini_with_grid.png`.
        Does nothing on unsupported platforms (e.g. Windows).
    """
    user_platform = platform.system()
    if user_platform not in ("Linux", "Darwin"):
        return  # no capture backend wired up for this platform

    # Accept "50%" strings or plain numbers (the original crashed on the
    # numeric defaults because it unconditionally sliced off a trailing "%").
    if isinstance(x, str):
        x = float(x.rstrip("%"))
    if isinstance(y, str):
        y = float(y.rstrip("%"))

    # Percentage of the screen -> absolute pixel coordinates.
    x = (x / 100) * monitor_size["width"]
    y = (y / 100) * monitor_size["height"]

    # Top-left corner of the square capture region centered on (x, y).
    half = ACCURATE_PIXEL_COUNT / 2
    x1, y1 = int(x - half), int(y - half)

    if user_platform == "Linux":
        x2, y2 = int(x + half), int(y + half)
        screenshot = ImageGrab.grab(bbox=(x1, y1, x2, y2))
        # Upscale 2x so the grid and percentage marks are easier to see.
        screenshot = screenshot.resize(
            (screenshot.width * 2, screenshot.height * 2), Image.LANCZOS
        )
        screenshot.save(file_path)
    else:  # Darwin: native screencapture utility (captures the cursor too).
        rect = f"-R{x1},{y1},{ACCURATE_PIXEL_COUNT},{ACCURATE_PIXEL_COUNT}"
        subprocess.run(["screencapture", "-C", rect, file_path])

    grid_screenshot_filename = os.path.join(
        "screenshots", "screenshot_mini_with_grid.png"
    )
    add_grid_to_image(
        file_path, grid_screenshot_filename, int(ACCURATE_PIXEL_COUNT / 2)
    )
715+
578716def capture_screen_with_cursor (file_path = os .path .join ("screenshots" , "screenshot.png" )):
717+ file_path = os .path .join ("screenshots" , "screenshot.png" )
579718 user_platform = platform .system ()
580719
581720 if user_platform == "Windows" :
@@ -585,8 +724,10 @@ def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot
585724 # Use xlib to prevent scrot dependency for Linux
586725 screen = Xlib .display .Display ().screen ()
587726 size = screen .width_in_pixels , screen .height_in_pixels
727+ monitor_size ["width" ] = size [0 ]
728+ monitor_size ["height" ] = size [1 ]
588729 screenshot = ImageGrab .grab (bbox = (0 , 0 , size [0 ], size [1 ]))
589- screenshot .save (file_path )
730+ screenshot .save (file_path )
590731 elif user_platform == "Darwin" : # (Mac OS)
591732 # Use the screencapture utility to capture the screen with the cursor
592733 subprocess .run (["screencapture" , "-C" , file_path ])
@@ -634,9 +775,16 @@ def main_entry():
634775 default = "gpt-4-vision-preview" ,
635776 )
636777
778+ parser .add_argument (
779+ "-accurate" ,
780+ help = "Activate Reflective Mouse Click Mode" ,
781+ action = "store_true" ,
782+ required = False ,
783+ )
784+
637785 try :
638786 args = parser .parse_args ()
639- main (args .model )
787+ main (args .model , accurate_mode = args . accurate )
640788 except KeyboardInterrupt :
641789 print (f"\n { ANSI_BRIGHT_MAGENTA } Exiting..." )
642790
0 commit comments