@@ -965,6 +965,112 @@ def build_predicate(spec: PredicateSpec | dict[str, Any]) -> Predicate:
965965 raise ValueError (f"Unsupported predicate: { name } " )
966966
967967
968+ # ---------------------------------------------------------------------------
969+ # Extraction Keywords for Markdown-based Text Extraction
970+ # ---------------------------------------------------------------------------
971+
# Keywords that mark a task as a text extraction suitable for read_markdown().
# A task containing any of them is routed to the markdown-based EXTRACT path
# instead of the snapshot-based extractor. Matching is case-insensitive and
# whole-word; see _is_text_extraction_task() for the exact rules.
TEXT_EXTRACTION_KEYWORDS = frozenset({
    # Direct extraction verbs
    "extract",
    "read",
    "parse",
    "scrape",
    "get",
    "fetch",
    "retrieve",
    "capture",
    "grab",
    "copy",
    "pull",
    # Question words that indicate reading content
    "what is",
    "what are",
    "what's",
    "show me",
    "tell me",
    "find",
    "list",
    "display",
    # Content-specific patterns
    "title",
    "headline",
    "heading",
    "text",
    "content",
    "body",
    "paragraph",
    "article",
    "post",
    "message",
    "description",
    "summary",
    "excerpt",
    # Data extraction patterns
    "price",
    "cost",
    "amount",
    "name",
    "label",
    "value",
    "number",
    "date",
    "time",
    "address",
    "email",
    "phone",
    "rating",
    "review",
    "comment",
    "author",
    "username",
    # Table/list extraction
    "table",
    "row",
    "column",
    "item",
    "entry",
    "record",
})


def _compile_keyword_pattern(keywords: frozenset[str]) -> re.Pattern[str]:
    """
    Build one regex that matches any of the given keywords as whole words.

    Single-word keywords may carry an optional plural suffix ("s" or "es").
    Multi-word keywords (phrases) must match word-for-word, with any run of
    whitespace allowed between the words.

    Args:
        keywords: Lowercase keywords or phrases to match

    Returns:
        Compiled pattern; it never matches if no usable keyword is given
    """
    words: list[str] = []
    phrases: list[str] = []

    # Sorted so the generated pattern does not depend on set iteration order.
    for keyword in sorted(keywords):
        parts = keyword.split()
        if not parts:
            continue
        if len(parts) == 1:
            words.append(re.escape(parts[0]))
        else:
            phrases.append(r"\s+".join(re.escape(part) for part in parts))

    branches: list[str] = []
    if words:
        word_alternation = "|".join(words)
        # Plural suffix applies to single words only, e.g. "title" -> "titles".
        branches.append(rf"(?:{word_alternation})(?:s|es)?")
    if phrases:
        phrase_alternation = "|".join(phrases)
        branches.append(rf"(?:{phrase_alternation})")

    if not branches:
        # An empty alternation would match everywhere; use a never-matching pattern.
        return re.compile(r"(?!)")

    combined = "|".join(branches)
    # Boundaries on both sides avoid hits inside longer words
    # (e.g., "time" in "sentiment", "what is" in "somewhat isolated").
    return re.compile(rf"\b(?:{combined})\b")


# Compiled once at import time so each call is a single regex search.
_TEXT_EXTRACTION_PATTERN = _compile_keyword_pattern(TEXT_EXTRACTION_KEYWORDS)


def _is_text_extraction_task(task: str) -> bool:
    """
    Determine if a task is a simple text extraction that can use read_markdown().

    Returns True if the task contains any keyword from TEXT_EXTRACTION_KEYWORDS.
    Matching is case-insensitive and respects word boundaries for both single
    words and phrases; single words also match with a trailing "s" or "es".

    Args:
        task: The task description to analyze

    Returns:
        True if this is a text extraction task suitable for read_markdown(),
        False otherwise (including for an empty or missing task)
    """
    if not task:
        return False

    return _TEXT_EXTRACTION_PATTERN.search(task.lower()) is not None
1072+
1073+
9681074# ---------------------------------------------------------------------------
9691075# Plan Normalization and Validation
9701076# ---------------------------------------------------------------------------
@@ -4178,39 +4284,89 @@ async def _execute_step(
41784284
41794285 if action_type == "EXTRACT" :
41804286 action_taken = "EXTRACT"
4181- page = (
4182- getattr (getattr (runtime , "backend" , None ), "page" , None )
4183- or getattr (getattr (runtime , "backend" , None ), "_page" , None )
4184- or getattr (runtime , "_legacy_page" , None )
4287+ # Determine extraction query from step goal or task
4288+ extract_query = step .goal or (
4289+ self ._current_task .task if self ._current_task is not None else "Extract relevant data from the current page"
41854290 )
4186- if page is None :
4187- error = "No page available for EXTRACT"
4188- else :
4189- from types import SimpleNamespace
41904291
4191- from ..read import extract_async
4292+ # Check if this is a text extraction task that can use markdown-based extraction
4293+ use_markdown_extraction = _is_text_extraction_task (extract_query )
41924294
4193- browser_like = SimpleNamespace (page = page )
4194- extract_query = step .goal or (
4195- self ._current_task .task if self ._current_task is not None else "Extract relevant data from the current page"
4196- )
4197- result = await extract_async (
4198- browser_like ,
4199- self .planner ,
4200- query = extract_query ,
4201- schema = None ,
4202- )
4203- llm_resp = getattr (result , "llm_response" , None )
4204- if llm_resp is not None :
4205- self ._record_token_usage ("extract" , llm_resp )
4206- if result .ok :
4207- extraction_succeeded = True
4208- extracted_data = result .data
4295+ if use_markdown_extraction :
4296+ # Step 1: Get page content as markdown (faster than snapshot-based extraction)
4297+ markdown_content = await runtime .read_markdown (max_chars = 8000 )
4298+ if markdown_content :
42094299 if self .config .verbose :
4210- preview = str (result .raw or "" )[:160 ]
4211- print (f" [ACTION] EXTRACT ok: { preview } " , flush = True )
4300+ preview = markdown_content [:160 ].replace ("\n " , " " )
4301+ print (f" [ACTION] EXTRACT - got markdown: { preview } ..." , flush = True )
4302+
4303+ # Step 2: Use LLM (executor) to extract specific data from markdown
4304+ extraction_prompt = f"""You are a text extraction assistant. Given the page content in markdown format, extract the specific information requested.
4305+
4306+ PAGE CONTENT (MARKDOWN):
4307+ { markdown_content }
4308+
4309+ EXTRACTION REQUEST:
4310+ { extract_query }
4311+
4312+ INSTRUCTIONS:
4313+ 1. Read the markdown content carefully
4314+ 2. Find and extract ONLY the specific information requested
4315+ 3. Return ONLY the extracted text, nothing else
4316+ 4. If the information is not found, return "NOT_FOUND"
4317+
4318+ EXTRACTED TEXT:"""
4319+
4320+ resp = self .executor .generate (
4321+ "You extract specific text from markdown content. Return only the extracted text." ,
4322+ extraction_prompt ,
4323+ temperature = 0.0 ,
4324+ max_new_tokens = 500 ,
4325+ )
4326+ self ._record_token_usage ("extract" , resp )
4327+
4328+ extracted_text = resp .content .strip ()
4329+ if extracted_text and extracted_text != "NOT_FOUND" :
4330+ extraction_succeeded = True
4331+ extracted_data = {"text" : extracted_text , "query" : extract_query }
4332+ if self .config .verbose :
4333+ print (f" [ACTION] EXTRACT ok: { extracted_text [:160 ]} " , flush = True )
4334+ else :
4335+ error = f"Could not find requested data: { extract_query } "
4336+ else :
4337+ error = "Failed to extract markdown from page"
4338+ else :
4339+ # Use LLM-based extraction for complex extraction tasks
4340+ page = (
4341+ getattr (getattr (runtime , "backend" , None ), "page" , None )
4342+ or getattr (getattr (runtime , "backend" , None ), "_page" , None )
4343+ or getattr (runtime , "_legacy_page" , None )
4344+ )
4345+ if page is None :
4346+ error = "No page available for EXTRACT"
42124347 else :
4213- error = result .error or "Extraction failed"
4348+ from types import SimpleNamespace
4349+
4350+ from ..read import extract_async
4351+
4352+ browser_like = SimpleNamespace (page = page )
4353+ result = await extract_async (
4354+ browser_like ,
4355+ self .planner ,
4356+ query = extract_query ,
4357+ schema = None ,
4358+ )
4359+ llm_resp = getattr (result , "llm_response" , None )
4360+ if llm_resp is not None :
4361+ self ._record_token_usage ("extract" , llm_resp )
4362+ if result .ok :
4363+ extraction_succeeded = True
4364+ extracted_data = result .data
4365+ if self .config .verbose :
4366+ preview = str (result .raw or "" )[:160 ]
4367+ print (f" [ACTION] EXTRACT ok: { preview } " , flush = True )
4368+ else :
4369+ error = result .error or "Extraction failed"
42144370 elif action_type in ("CLICK" , "TYPE_AND_SUBMIT" ):
42154371 # Try intent heuristics first (if available)
42164372 elements = getattr (ctx .snapshot , "elements" , []) or []
0 commit comments