Merge pull request #50 from AnswerDotAI/enhance-url2note

jph00 · web-flow · commit b638d710d5b1 · 2025-12-03T07:54:52.000+10:00
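Enhance url2note: add `ai_img` option (passed through to `read_url`) and `split_re` to split the fetched markdown into multiple notes at headings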
diff --git a/dialoghelper/core.py b/dialoghelper/core.py
@@ -272,12 +272,16 @@ def run_msg(
 def url2note(
     url:str, # URL to read
     extract_section:bool=True, # If url has an anchor, return only that section
-    selector:str=None # Select section(s) using BeautifulSoup.select (overrides extract_section)
+    selector:str=None, # Select section(s) using BeautifulSoup.select (overrides extract_section)
+    ai_img:bool=True, # Make images visible to the AI
+    split_re:str=r'(?=^#{1,6} .+)' # Regex to split content into multiple notes, set to False for single note
 ):
-    "Read URL as markdown, and add a note below current message with the result"
-    res = read_url(url, as_md=True, extract_section=extract_section, selector=selector)
+    "Read URL as markdown, and add note(s) below current message with the result"
+    res = read_url(url, as_md=True, extract_section=extract_section, selector=selector, ai_img=ai_img)
+    if split_re: return [add_msg(s) for s in re.split(split_re, res, flags=re.MULTILINE) if s.strip()]
     return add_msg(res)
 
+
 # %% ../nbs/00_core.ipynb
 def ast_py(code:str):
     "Get an SgRoot root node for python `code`"
diff --git a/nbs/00_core.ipynb b/nbs/00_core.ipynb
@@ -645,7 +645,7 @@
      "output_type": "stream",
      "text": [
       "_9c544573\n",
-      "\n"
+      "_9558b075\n"
      ]
     }
    ],
@@ -855,6 +855,16 @@
     "_edit_id = add_msg('This message should be found.\\n\\nThis is a multiline message.')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "57797a25",
+   "metadata": {},
+   "source": [
+    "This message should be found.\n",
+    "\n",
+    "This is a multiline message."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -936,6 +946,16 @@
    "id": "6e354677",
    "metadata": {},
    "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
     {
      "data": {
       "text/plain": [
@@ -993,11 +1013,14 @@
     "def url2note(\n",
     "    url:str, # URL to read\n",
     "    extract_section:bool=True, # If url has an anchor, return only that section\n",
-    "    selector:str=None # Select section(s) using BeautifulSoup.select (overrides extract_section)\n",
+    "    selector:str=None, # Select section(s) using BeautifulSoup.select (overrides extract_section)\n",
+    "    ai_img:bool=True, # Make images visible to the AI\n",
+    "    split_re:str=r'(?=^#{1,6} .+)' # Regex to split content into multiple notes, set to False for single note\n",
     "):\n",
-    "    \"Read URL as markdown, and add a note below current message with the result\"\n",
-    "    res = read_url(url, as_md=True, extract_section=extract_section, selector=selector)\n",
-    "    return add_msg(res)"
+    "    \"Read URL as markdown, and add note(s) below current message with the result\"\n",
+    "    res = read_url(url, as_md=True, extract_section=extract_section, selector=selector, ai_img=ai_img)\n",
+    "    if split_re: return [add_msg(s) for s in re.split(split_re, res, flags=re.MULTILINE) if s.strip()]\n",
+    "    return add_msg(res)\n"
    ]
   },
   {
@@ -1020,6 +1043,26 @@
     "del_msg(_id)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43554bd9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ids = url2note('https://www.answer.ai/posts/2025-10-01-cachy.html')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b02115e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_ = [del_msg(i) for i in _ids]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1089,25 +1132,25 @@
     {
      "data": {
       "text/plain": [
-       "[(\"xpost('http://localhost:5001/pop_data_blocking_', data={'data_id': idx})\",\n",
-       "  {'B': {'text': \"{'data_id': idx}\",\n",
-       "    'range': {'byteOffset': {'start': 1185, 'end': 1201},\n",
-       "     'start': {'line': 38, 'column': 72},\n",
-       "     'end': {'line': 38, 'column': 88}}},\n",
-       "   'A': {'text': \"'http://localhost:5001/pop_data_blocking_'\",\n",
-       "    'range': {'byteOffset': {'start': 1136, 'end': 1178},\n",
-       "     'start': {'line': 38, 'column': 23},\n",
-       "     'end': {'line': 38, 'column': 65}}}},\n",
-       "  'dialoghelper/experimental.py'),\n",
-       " ('xpost(f\\'http://localhost:{dh_settings[\"port\"]}/{path}\\', data=data)',\n",
-       "  {'A': {'text': 'f\\'http://localhost:{dh_settings[\"port\"]}/{path}\\'',\n",
-       "    'range': {'byteOffset': {'start': 2624, 'end': 2672},\n",
-       "     'start': {'line': 70, 'column': 16},\n",
-       "     'end': {'line': 70, 'column': 64}}},\n",
-       "   'B': {'text': 'data',\n",
-       "    'range': {'byteOffset': {'start': 2679, 'end': 2683},\n",
+       "[('xpost(f\\'http://localhost:{dh_settings[\"port\"]}/{path}\\', data=data)',\n",
+       "  {'B': {'text': 'data',\n",
+       "    'range': {'byteOffset': {'start': 2753, 'end': 2757},\n",
        "     'start': {'line': 70, 'column': 71},\n",
-       "     'end': {'line': 70, 'column': 75}}}},\n",
+       "     'end': {'line': 70, 'column': 75}}},\n",
+       "   'A': {'text': 'f\\'http://localhost:{dh_settings[\"port\"]}/{path}\\'',\n",
+       "    'range': {'byteOffset': {'start': 2698, 'end': 2746},\n",
+       "     'start': {'line': 70, 'column': 16},\n",
+       "     'end': {'line': 70, 'column': 64}}}},\n",
+       "  'dialoghelper/core.py'),\n",
+       " (\"xpost(url, data={'data_id': idx, 'timeout': timeout})\",\n",
+       "  {'B': {'text': \"{'data_id': idx, 'timeout': timeout}\",\n",
+       "    'range': {'byteOffset': {'start': 4450, 'end': 4486},\n",
+       "     'start': {'line': 121, 'column': 36},\n",
+       "     'end': {'line': 121, 'column': 72}}},\n",
+       "   'A': {'text': 'url',\n",
+       "    'range': {'byteOffset': {'start': 4440, 'end': 4443},\n",
+       "     'start': {'line': 121, 'column': 26},\n",
+       "     'end': {'line': 121, 'column': 29}}}},\n",
        "  'dialoghelper/core.py')]"
       ]
      },
@@ -1156,7 +1199,7 @@
     {
      "data": {
       "text/plain": [
-       "{'success': 'Inserted text after line 5 in message _c3581eea'}"
+       "{'success': 'Inserted text after line 5 in message _f813f590'}"
       ]
      },
      "execution_count": null,
@@ -1220,7 +1263,7 @@
     {
      "data": {
       "text/plain": [
-       "{'success': 'Replaced text in message _c3581eea'}"
+       "{'success': 'Replaced text in message _f813f590'}"
       ]
      },
      "execution_count": null,
@@ -1282,7 +1325,7 @@
     {
      "data": {
       "text/plain": [
-       "{'success': 'Successfully replaced all the strings in message _c3581eea'}"
+       "{'success': 'Successfully replaced all the strings in message _f813f590'}"
       ]
      },
      "execution_count": null,
@@ -1345,7 +1388,7 @@
     {
      "data": {
       "text/plain": [
-       "{'success': 'Replaced lines 2 to 4 in message _c3581eea'}"
+       "{'success': 'Replaced lines 2 to 4 in message _f813f590'}"
       ]
      },
      "execution_count": null,
@@ -1818,9 +1861,12 @@
       "- &`find_var`: Search for var in all frames of the call stack\n",
       "- &`set_var`: Set var to val after finding it in all frames of the call stack\n",
       "- &`find_dname`: Get the message id by searching the call stack for __dialog_id.\n",
-      "- &`find_msg_id`: Get the message id by searching the call stack for __dialog_id.\n",
+      "- &`find_msg_id`: Get the message id by searching the call stack for __msg_id.\n",
       "- &`curr_dialog`: Get the current dialog info.\n",
       "- &`msg_idx`: Get absolute index of message in dialog.\n",
+      "- &`add_scr`: Swap a script element to the end of the js-script element\n",
+      "- &`iife`: Wrap javascript code string in an IIFE and execute it via `add_html`\n",
+      "- &`event_get`: Call `fire_event` and then `pop_data` to get a response\n",
       "- &`find_msgs`: Find `list[dict]` of messages in current specific dialog that contain the given information. To refer to a message found later, use its `id` field.\n",
       "- &`add_html`: Send HTML to the browser to be swapped into the DOM\n",
       "- &`read_msg`: Get the message indexed in the current dialog.\n",
@@ -1833,7 +1879,7 @@
       "    - Use `content` param to update contents.\n",
       "    - Only include parameters to update--missing ones will be left unchanged.\n",
       "- &`run_msg`: Adds a message to the run queue. Use read_msg to see the output once it runs.\n",
-      "- &`url2note`: Read URL as markdown, and add a note below current message with the result\n",
+      "- &`url2note`: Read URL as markdown, and add note(s) below current message with the result\n",
       "- &`ast_py`: Get an SgRoot root node for python `code`\n",
       "- &`ast_grep`: Use the `ast-grep` command to find `pattern` in `path`\n",
       "- &`msg_insert_line`: Insert text at a specific line number in a message\n",