diff --git a/pyproject.toml b/pyproject.toml index 74c355f5..78c7f3b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath-langchain" -version = "0.7.2" +version = "0.7.3" description = "Python SDK that enables developers to build and deploy LangGraph agents to the UiPath Cloud Platform" readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/src/uipath_langchain/agent/react/agent.py b/src/uipath_langchain/agent/react/agent.py index 15b4e901..96c8750f 100644 --- a/src/uipath_langchain/agent/react/agent.py +++ b/src/uipath_langchain/agent/react/agent.py @@ -6,9 +6,12 @@ from langgraph.constants import END, START from langgraph.graph import StateGraph from pydantic import BaseModel +from uipath.platform.context_grounding import DeepRagContent from uipath.platform.guardrails import BaseGuardrail +from ...runtime._citations import cas_deep_rag_citation_wrapper from ..guardrails.actions import GuardrailAction +from ..tools.structured_tool_with_output_type import StructuredToolWithOutputType from .guardrails.guardrails_subgraph import ( create_agent_init_guardrails_subgraph, create_agent_terminate_guardrails_subgraph, @@ -74,6 +77,15 @@ def create_agent( init_node = create_init_node(messages, input_schema, config.is_conversational) tool_nodes = create_tool_node(agent_tools) + + # for conversational agents we transform deeprag's citation format into cas's + if config.is_conversational: + for node in tool_nodes.values(): + if isinstance(node.tool, StructuredToolWithOutputType) and issubclass( + node.tool.output_type, DeepRagContent + ): + node.awrapper = cas_deep_rag_citation_wrapper + tool_nodes_with_guardrails = create_tools_guardrails_subgraph( tool_nodes, guardrails, input_schema=input_schema ) diff --git a/src/uipath_langchain/agent/tools/context_tool.py b/src/uipath_langchain/agent/tools/context_tool.py index a82ab81f..d574a6f6 100644 --- 
def _make_source(
    citation: _ParsedCitation,
    source_numbers: dict[_ParsedCitation, int],
    next_number: int,
) -> tuple[
    UiPathConversationCitationSourceUrl | UiPathConversationCitationSourceMedia, int
]:
    """Create the source model for ``citation`` and return the updated counter.

    Numbering is deduplicated through ``source_numbers``: a citation that is
    already a key in the mapping reuses its stored number, while an unseen
    citation is recorded under ``next_number`` and the counter advances by one.
    ``source_numbers`` is mutated in place.

    Args:
        citation: Parsed citation to convert.
        source_numbers: Mapping from citations seen so far to their numbers.
        next_number: Number to assign to the next unseen citation.

    Returns:
        A ``(source, next_number)`` pair. ``source`` is a URL source when
        ``citation.url`` is set; otherwise it is a media source built from
        ``citation.reference`` and ``citation.page_number``.
    """
    if citation in source_numbers:
        assigned = source_numbers[citation]
    else:
        # First time this citation is seen: claim the next free number.
        assigned = next_number
        source_numbers[citation] = assigned
        next_number += 1

    if citation.url is None:
        # No URL available, so describe the source as downloadable media.
        media_source = UiPathConversationCitationSourceMedia(
            title=citation.title,
            number=assigned,
            mime_type=None,
            download_url=citation.reference,
            page_number=citation.page_number,
        )
        return media_source, next_number

    url_source = UiPathConversationCitationSourceUrl(
        title=citation.title,
        number=assigned,
        url=citation.url,
    )
    return url_source, next_number
def extract_citations_from_text(
    text: str,
) -> tuple[str, list[UiPathConversationCitationData]]:
    """Split inline citation tags out of ``text``.

    The text is parsed with ``_parse_citations``; each parsed segment pairs a
    run of text with the citation that follows it (or ``None``).

    Args:
        text: Text that may contain inline citation tags.

    Returns:
        A ``(cleaned_text, citations)`` pair. ``cleaned_text`` is the
        concatenation of the segment texts. Each citation entry records the
        offset and length, within ``cleaned_text``, of the segment text it is
        attached to. When parsing yields no segments, the input text is
        returned unchanged with an empty list.

    Notes:
        A citation whose segment text is empty is appended to the sources of
        the most recent citation entry. If no entry exists yet, it is not
        recorded, although it still consumes a source number.
    """
    parsed = _parse_citations(text)
    if not parsed:
        return (text, [])

    numbering: dict[_ParsedCitation, int] = {}
    counter = 1
    pieces: list[str] = []
    spans: list[UiPathConversationCitationData] = []
    cursor = 0  # position of the current segment inside the cleaned text

    for chunk, parsed_citation in parsed:
        pieces.append(chunk)
        span_length = len(chunk)

        if parsed_citation is None:
            cursor += span_length
            continue

        source, counter = _make_source(parsed_citation, numbering, counter)
        if span_length > 0:
            span = UiPathConversationCitationData(
                offset=cursor,
                length=span_length,
                sources=[source],
            )
            spans.append(span)
        elif spans:
            # Citation directly follows another one: attach it to the same
            # text span instead of creating a zero-length entry.
            spans[-1].sources.append(source)

        cursor += span_length

    return ("".join(pieces), spans)
async def cas_deep_rag_citation_wrapper(tool: BaseTool, call: ToolCall):
    """Invoke ``tool`` and rewrite its DeepRag result to use inline citation tags.

    The result's ``content`` is parsed as JSON, handed to
    ``convert_citations_to_inline_tags``, and replaced by a JSON object that
    holds only the converted string under the ``"text"`` key.

    The transformation is best-effort: if any step raises, a warning with the
    traceback is logged and the result is returned as the tool produced it.

    Args:
        tool: Tool to invoke.
        call: Tool call passed to ``tool.ainvoke``.

    Returns:
        The tool's result, with ``content`` rewritten when the transformation
        succeeded.
    """
    tool_result = await tool.ainvoke(call)
    try:
        payload = json.loads(tool_result.content)
        inline_text = convert_citations_to_inline_tags(payload)
        tool_result.content = json.dumps({"text": inline_text})
    except Exception:
        # Deliberately broad: a malformed payload must not fail the tool call.
        logger.warning(
            "Failed to transform DeepRag citations, returning raw result",
            exc_info=True,
        )
    return tool_result
isinstance(result[0].content_parts[0].data, UiPathInlineValue) assert result[0].content_parts[0].data.inline == "first part second part" + + +class TestMapLangChainAIMessageCitations: + """Tests for citation extraction in _map_langchain_ai_message_to_uipath_message_data.""" + + def test_ai_message_with_citation_tags_populates_citations(self): + """AIMessage with inline citation tags should have citations populated and text cleaned.""" + messages: list[AnyMessage] = [ + AIMessage( + content='Some fact and more.' + ) + ] + + result = ( + UiPathChatMessagesMapper.map_langchain_messages_to_uipath_message_data_list( + messages + ) + ) + + assert len(result) == 1 + part = result[0].content_parts[0] + assert isinstance(part.data, UiPathInlineValue) + assert part.data.inline == "Some fact and more." + assert len(part.citations) == 1 + assert part.citations[0].offset == 0 + assert part.citations[0].length == 9 # "Some fact" + source = part.citations[0].sources[0] + assert isinstance(source, UiPathConversationCitationSourceUrl) + assert source.url == "https://doc.com" + assert source.title == "Doc" + + def test_ai_message_without_citation_tags_has_empty_citations(self): + """AIMessage without citation tags should have empty citations list.""" + messages: list[AnyMessage] = [AIMessage(content="Plain text response")] + + result = ( + UiPathChatMessagesMapper.map_langchain_messages_to_uipath_message_data_list( + messages + ) + ) + + assert len(result) == 1 + part = result[0].content_parts[0] + assert isinstance(part.data, UiPathInlineValue) + assert part.data.inline == "Plain text response" + assert part.citations == [] + + def test_ai_message_with_media_citation(self): + """AIMessage with reference/media citation tag should produce media source.""" + messages: list[AnyMessage] = [ + AIMessage( + content='A finding' + ) + ] + + result = ( + UiPathChatMessagesMapper.map_langchain_messages_to_uipath_message_data_list( + messages + ) + ) + + assert len(result) == 1 + part = 
result[0].content_parts[0] + assert isinstance(part.data, UiPathInlineValue) + assert part.data.inline == "A finding" + assert len(part.citations) == 1 + source = part.citations[0].sources[0] + assert isinstance(source, UiPathConversationCitationSourceMedia) + assert source.download_url == "https://r.com" + assert source.page_number == "3" diff --git a/tests/runtime/test_citations.py b/tests/runtime/test_citations.py index 3bb6cbcf..99ea08f6 100644 --- a/tests/runtime/test_citations.py +++ b/tests/runtime/test_citations.py @@ -1,6 +1,11 @@ """Tests for the CitationStreamProcessor and citation parsing utilities.""" # mypy: disable-error-code="union-attr,operator" +import json +from unittest.mock import AsyncMock + +import pytest +from langchain_core.messages.tool import ToolCall, ToolMessage from uipath.core.chat import ( UiPathConversationCitationSourceMedia, UiPathConversationCitationSourceUrl, @@ -9,6 +14,9 @@ from uipath_langchain.runtime._citations import ( CitationStreamProcessor, _find_partial_tag_start, + cas_deep_rag_citation_wrapper, + convert_citations_to_inline_tags, + extract_citations_from_text, ) @@ -533,3 +541,427 @@ def test_uip_prefix_followed_by_citation_single_chunk(self): assert len(cited) == 1 assert cited[0].data == "' + ' and second' + ) + cleaned, citations = extract_citations_from_text(text) + assert cleaned == "First and second" + assert len(citations) == 2 + # First citation: "First" at offset 0, length 5 + assert citations[0].offset == 0 + assert citations[0].length == 5 + assert citations[0].sources[0].url == "https://a.com" + # Second citation: " and second" at offset 5, length 11 + assert citations[1].offset == 5 + assert citations[1].length == 11 + assert citations[1].sources[0].url == "https://b.com" + + def test_duplicate_sources_same_number(self): + """Duplicate sources (same title+url) get the same number.""" + text = ( + 'A' + 'B' + ) + cleaned, citations = extract_citations_from_text(text) + assert cleaned == "AB" + assert 
len(citations) == 2 + assert citations[0].sources[0].number == citations[1].sources[0].number + + def test_back_to_back_citations_merged_into_previous(self): + """Back-to-back citations merge the second source into the previous citation.""" + text = ( + 'Text' + '' + ) + cleaned, citations = extract_citations_from_text(text) + assert cleaned == "Text" + assert len(citations) == 1 + assert len(citations[0].sources) == 2 + assert citations[0].sources[0].url == "https://a.com" + assert citations[0].sources[1].url == "https://b.com" + + def test_three_back_to_back_citations(self): + """Three back-to-back citations all merge into one citation with three sources.""" + text = ( + 'Answer' + '' + '' + ) + cleaned, citations = extract_citations_from_text(text) + assert cleaned == "Answer" + assert len(citations) == 1 + assert len(citations[0].sources) == 3 + assert citations[0].sources[0].title == "A" + assert citations[0].sources[1].title == "B" + assert citations[0].sources[2].title == "C" + + def test_back_to_back_citations_at_start_with_no_preceding_text(self): + """Back-to-back citations at the very start (no preceding text) are all dropped.""" + text = ( + '' + '' + ) + cleaned, citations = extract_citations_from_text(text) + assert cleaned == "" + # First citation has no preceding text and no previous citation to merge into + # Second citation also has no preceding text but merges into the first + # Both end up dropped since neither has a text span + assert len(citations) == 0 + + def test_empty_text(self): + """Empty string returns empty text and no citations.""" + cleaned, citations = extract_citations_from_text("") + assert cleaned == "" + assert citations == [] + + def test_text_with_trailing_content(self): + """Citation in middle of text, trailing text preserved.""" + text = 'A fact and more text.' + cleaned, citations = extract_citations_from_text(text) + assert cleaned == "A fact and more text." 
+ assert len(citations) == 1 + assert citations[0].offset == 0 + assert citations[0].length == 6 # len("A fact") + + def test_different_sources_get_different_numbers(self): + """Different sources get incrementing numbers.""" + text = ( + 'A' + 'B' + ) + cleaned, citations = extract_citations_from_text(text) + assert citations[0].sources[0].number == 1 + assert citations[1].sources[0].number == 2 + + +class TestConvertCitationsToInlineTags: + """Test cases for convert_citations_to_inline_tags function.""" + + def test_basic_replacement(self): + """Test basic [1] and [2] replacement with tags.""" + content = { + "text": "Fact A [1] and fact B [2].", + "citations": [ + { + "ordinal": 1, + "pageNumber": 3, + "source": "Report.pdf", + "reference": "https://example.com/ref1", + }, + { + "ordinal": 2, + "pageNumber": 7, + "source": "Manual.pdf", + "reference": "https://example.com/ref2", + }, + ], + } + + result = convert_citations_to_inline_tags(content) + + assert "[1]" not in result + assert "[2]" not in result + assert ( + '' + in result + ) + assert ( + '' + in result + ) + + def test_same_ordinal_used_twice(self): + """Test that the same ordinal appearing twice in text is replaced in both places.""" + content = { + "text": "First mention [1] and second mention [1].", + "citations": [ + { + "ordinal": 1, + "pageNumber": 5, + "source": "Doc.pdf", + "reference": "https://example.com/doc", + }, + ], + } + + result = convert_citations_to_inline_tags(content) + + assert "[1]" not in result + assert result.count('' in result + + def test_back_to_back_ordinals(self): + """Test [1][2] back-to-back at end of text are both converted.""" + content = { + "text": "Answer [1][2]", + "citations": [ + { + "ordinal": 1, + "pageNumber": 1, + "source": "A.pdf", + "reference": "https://a.com", + }, + { + "ordinal": 2, + "pageNumber": 5, + "source": "B.pdf", + "reference": "https://b.com", + }, + ], + } + + result = convert_citations_to_inline_tags(content) + + assert "[1]" not in result 
+ assert "[2]" not in result + assert '