
Commit 45ba6e6

Merge branch 'OthersideAI:main' into installer-script
2 parents: 9a8d364 + 285d62d

3 files changed: +30 -16 lines changed

README.md

Lines changed: 8 additions & 3 deletions
@@ -20,7 +20,12 @@
 > **Note:** GPT-4V's error rate in estimating XY mouse click locations is currently quite high. This framework aims to track the progress of multimodal models over time, aspiring to achieve human-level performance in computer operation.
 
 ### Ongoing Development
-At [HyperwriteAI](https://www.hyperwriteai.com/), we are developing a multimodal model with more accurate click location predictions.
+At [HyperwriteAI](https://www.hyperwriteai.com/), we are developing Agent-1-Vision a multimodal model with more accurate click location predictions.
+
+### Agent-1-Vision Model API Access
+We will soon be offering API access to our Agent-1-Vision model.
+
+If you're interested in gaining access to this API, sign up [here](https://othersideai.typeform.com/to/FszaJ1k8?typeform-source=www.hyperwriteai.com).
 
 ### Additional Thoughts
 We recognize that some operating system functions may be more efficiently executed with hotkeys such as entering the Browser Address bar using `command + L` rather than by simulating a mouse click at the correct XY location. We plan to make these improvements over time. However, it's important to note that many actions require the accurate selection of visual elements on the screen, necessitating precise XY mouse click locations. A primary focus of this project is to refine the accuracy of determining these click locations. We believe this is essential for achieving a fully self-operating computer in the current technological landscape.
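
For illustration only (not part of this commit), a hotkey-based action like the `command + L` example above could be issued directly with pyautogui; the helper below is a hypothetical sketch, not the project's implementation:

```python
import platform
import pyautogui

def focus_address_bar():
    # Hypothetical helper: focus the browser address bar via the OS hotkey
    # instead of estimating an XY click location for it.
    if platform.system() == "Darwin":
        pyautogui.hotkey("command", "l")  # macOS browsers use command+L
    else:
        pyautogui.hotkey("ctrl", "l")     # Windows/Linux browsers use ctrl+L
```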
@@ -83,7 +88,7 @@ operate
 - **Adding New Multimodal Models**: Integration of new multimodal models is welcomed. If you have a specific model in mind that you believe would be a valuable addition, please feel free to integrate it and submit a PR.
 - **Framework Architecture Improvements**: Think you can enhance the framework architecture described in the intro? We welcome suggestions and PRs.
 
-For any input on improving this project, feel free to reach out to me on [Twitter](https://twitter.com/josh_bickett).
+For any input on improving this project, feel free to reach out to [Josh](https://twitter.com/josh_bickett) on Twitter.
 
 ### Follow HyperWriteAI for More Updates

@@ -92,4 +97,4 @@ Stay updated with the latest developments:
 - Follow HyperWriteAI on [LinkedIn](https://www.linkedin.com/company/othersideai/).
 
 ### Compatibility
-- This project is only compatible with MacOS at this time.
+- This project is compatible with Mac OS, Windows, and Linux (with X server installed).
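
On Linux, the new screenshot path added in this commit talks to the X server; a purely illustrative pre-flight check (not part of this diff) might look like:

```python
import os
import platform

# Illustrative check: the Xlib/ImageGrab screenshot path on Linux needs a
# reachable X server, normally advertised through the DISPLAY variable.
if platform.system() == "Linux" and not os.environ.get("DISPLAY"):
    print("No DISPLAY set - an X server is required for screen capture.")
```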

operate/main.py

Lines changed: 21 additions & 13 deletions
@@ -11,12 +11,13 @@
 import pyautogui
 import argparse
 import platform
+import Xlib.display
 
 from prompt_toolkit import prompt
 from prompt_toolkit.shortcuts import message_dialog
 from prompt_toolkit.styles import Style as PromptStyle
 from dotenv import load_dotenv
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw, ImageFont, ImageGrab
 import matplotlib.font_manager as fm
 from openai import OpenAI

@@ -27,6 +28,7 @@
 
 client = OpenAI()
 client.api_key = os.getenv("OPENAI_API_KEY")
+client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
 
 VISION_PROMPT = """
 You are a Self-Operating Computer. You use the same operating system as a human.
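
The new `OPENAI_API_BASE_URL` override lets the client be pointed at an OpenAI-compatible endpoint instead of the default. A minimal sketch of how it might be used (the gateway URL is a placeholder, not from the project):

```python
import os
from openai import OpenAI

# Hypothetical: route requests through an OpenAI-compatible gateway.
os.environ["OPENAI_API_BASE_URL"] = "https://llm-gateway.example.com/v1"  # placeholder URL

client = OpenAI()
client.api_key = os.getenv("OPENAI_API_KEY")
# Same pattern as the diff: fall back to the library default when the variable is unset.
client.base_url = os.getenv("OPENAI_API_BASE_URL", client.base_url)
```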
@@ -62,7 +64,7 @@
 Objective: Open Spotify and play the beatles
 SEARCH Spotify
 __
-Objective: Find a image of a banana
+Objective: Find an image of a banana
 CLICK {{ "x": "50%", "y": "60%", "description": "Click: Google Search field", "reason": "This will allow me to search for a banana" }}
 __
 Objective: Go buy a book about the history of the internet
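
The CLICK action in this prompt expresses coordinates as percentages of the screen. An illustrative sketch of turning such values into an actual click (the helper name and parsing are hypothetical, not the project's code):

```python
import pyautogui

def click_at_percent(x_percent: str, y_percent: str) -> None:
    # Hypothetical helper: convert "50%" / "60%" style values from a CLICK
    # action into absolute pixel coordinates, then click there.
    width, height = pyautogui.size()
    x = int(width * float(x_percent.rstrip("%")) / 100)
    y = int(height * float(y_percent.rstrip("%")) / 100)
    pyautogui.click(x, y)

# Using the example values from the prompt above:
click_at_percent("50%", "60%")
```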
@@ -178,10 +180,9 @@ def main(model):
     }
     messages = [assistant_message, user_message]
 
-    looping = True
     loop_count = 0
 
-    while looping:
+    while True:
         if DEBUG:
             print("[loop] messages before next action:\n\n\n", messages[1:])
         try:
@@ -194,25 +195,21 @@ def main(model):
             print(
                 f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}"
             )
-            looping = False
             break
         except Exception as e:
             print(
                 f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] -> {e} {ANSI_RESET}"
             )
-            looping = False
             break
 
         if action_type == "DONE":
             print(
                 f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Objective complete {ANSI_RESET}"
             )
-            looping = False
             summary = summarize(messages, objective)
             print(
                 f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BLUE} Summary\n{ANSI_RESET}{summary}"
             )
-
             break
 
         if action_type != "UNKNOWN":
@@ -234,8 +231,8 @@ def main(model):
             print(
                 f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_RED}[Error] AI response\n{ANSI_RESET}{response}"
             )
-            looping = False
             break
+
         print(
             f"{ANSI_GREEN}[Self-Operating Computer]{ANSI_BRIGHT_MAGENTA} [Act] {action_type} COMPLETE {ANSI_RESET}{function_response}"
         )
@@ -248,7 +245,7 @@ def main(model):
 
         loop_count += 1
         if loop_count > 10:
-            looping = False
+            break
 
 
 def format_summary_prompt(objective):
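
Taken together, the hunks above replace the `looping` flag with direct `break` statements. A simplified, self-contained sketch of the resulting control flow (the model call is stubbed out; not the project's actual code):

```python
def next_action(step):
    # Stand-in for the vision-model call; the real loop asks the model for an action.
    return "DONE" if step >= 3 else "CLICK"

loop_count = 0
while True:
    action_type = next_action(loop_count)
    if action_type == "DONE":
        break  # objective complete
    loop_count += 1
    if loop_count > 10:
        break  # safety cap on actions per objective, as in the diff
```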
@@ -561,12 +558,23 @@ def search(text):
 
 
 def capture_screen_with_cursor(file_path=os.path.join("screenshots", "screenshot.png")):
-    # Use the screencapture utility to capture the screen with the cursor
-    if platform.system() == "Windows":
+    user_platform = platform.system()
+
+    if user_platform == "Windows":
         screenshot = pyautogui.screenshot()
         screenshot.save(file_path)
-    else:
+    elif user_platform == "Linux":
+        # Use xlib to prevent scrot dependency for Linux
+        screen = Xlib.display.Display().screen()
+        size = screen.width_in_pixels, screen.height_in_pixels
+        screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
+        screenshot.save(file_path)
+    elif user_platform == "Darwin":  # (Mac OS)
+        # Use the screencapture utility to capture the screen with the cursor
         subprocess.run(["screencapture", "-C", file_path])
+    else:
+        print(f"The platform you're using ({user_platform}) is not currently supported")
+
 
 def extract_json_from_string(s):
     # print("extracting json from string", s)
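
A minimal standalone sketch of the new Linux branch (assumes an X server is running and the Pillow and python3-xlib packages from requirements.txt are installed):

```python
import Xlib.display
from PIL import ImageGrab

# Mirror of the Linux path above: ask the X server for the screen size,
# then grab that region with Pillow instead of shelling out to scrot.
screen = Xlib.display.Display().screen()
width, height = screen.width_in_pixels, screen.height_in_pixels
ImageGrab.grab(bbox=(0, 0, width, height)).save("screenshot.png")
```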

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ pyperclip==1.8.2
 PyRect==0.2.0
 pyscreenshot==3.1
 PyScreeze==0.1.29
+python3-xlib==0.15
 python-dateutil==2.8.2
 python-dotenv==1.0.0
 pytweening==1.0.7
