diff --git a/docs/assets/use_cases/research_agent/research_ai_agent.ipynb b/docs/assets/use_cases/research_agent/research_ai_agent.ipynb
index 1edeb93aa..e6c6c3d56 100644
--- a/docs/assets/use_cases/research_agent/research_ai_agent.ipynb
+++ b/docs/assets/use_cases/research_agent/research_ai_agent.ipynb
@@ -1,928 +1,877 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "imQS4g52zsHM"
- },
- "source": [
- "### Installing required packages"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "jrWsiNN_zsHO"
- },
- "outputs": [],
- "source": [
- "%%capture\n",
- "!pip3 install openai pandas sentence-transformers transformers superlinked==19.21.1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "qWYevol5zsHO"
- },
- "source": [
- "### Setting up the Imports and setting up the intial library checks"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "idNHQDiGzsHP",
- "outputId": "d5b4f221-ab00-44db-d44f-cbcd34220f62"
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/vipul/Nova/Work/Superlinked/superlinked-agent/env/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
- " from .autonotebook import tqdm as notebook_tqdm\n"
- ]
- }
- ],
- "source": [
- "import pandas as pd\n",
- "import superlinked.framework as sl\n",
- "from datetime import timedelta\n",
- "from sentence_transformers import SentenceTransformer\n",
- "from openai import OpenAI\n",
- "import os\n",
- "from abc import ABC, abstractmethod\n",
- "from typing import Any, Optional, Dict\n",
- "from tqdm import tqdm\n",
- "from google.colab import userdata\n",
- "\n",
- "# Abstract Tool Class\n",
- "class Tool(ABC):\n",
- " @abstractmethod\n",
- " def name(self) -> str:\n",
- " pass\n",
- "\n",
- " @abstractmethod\n",
- " def description(self) -> str:\n",
- " pass\n",
- "\n",
- " @abstractmethod\n",
- " def use(self, *args, **kwargs) -> Any:\n",
- " pass\n",
- "\n",
- "\n",
- "# Get API key from Google Colab secrets\n",
- "try:\n",
- " api_key = userdata.get('OPENAI_API_KEY')\n",
- "except KeyError:\n",
- " raise ValueError(\"OPENAI_API_KEY not found in user secrets. Please add it using Tools > User secrets.\")\n",
- "\n",
- "# Initialize OpenAI Client\n",
- "api_key = os.environ.get(\"OPENAI_API_KEY\", \"your-openai-key\") # Replace with your OpenAI API key\n",
- "if not api_key:\n",
- " raise ValueError(\"Please set the OPENAI_API_KEY environment variable.\")\n",
- "\n",
- "client = OpenAI(api_key=api_key)\n",
- "model = \"gpt-4\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "b5jxe_IAzsHQ"
- },
- "source": [
- "### Downloading the relevant data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "72bCIO_uzsHQ",
- "outputId": "0d410eaf-d7c9-48f3-d786-69231a9277c8"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "--2025-03-13 22:39:51-- https://drive.google.com/uc?export=download&id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j\n",
- "Resolving drive.google.com (drive.google.com)... 142.250.207.238\n",
- "Connecting to drive.google.com (drive.google.com)|142.250.207.238|:443... connected.\n",
- "HTTP request sent, awaiting response... 303 See Other\n",
- "Location: https://drive.usercontent.google.com/download?id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j&export=download [following]\n",
- "--2025-03-13 22:39:52-- https://drive.usercontent.google.com/download?id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j&export=download\n",
- "Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.193.225\n",
- "Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.193.225|:443... connected.\n",
- "HTTP request sent, awaiting response... 200 OK\n",
- "Length: 13941471 (13M) [application/octet-stream]\n",
- "Saving to: ‘arxiv_ai_data.csv’\n",
- "\n",
- "arxiv_ai_data.csv 100%[===================>] 13.29M 1.90MB/s in 7.3s \n",
- "\n",
- "2025-03-13 22:40:13 (1.81 MB/s) - ‘arxiv_ai_data.csv’ saved [13941471/13941471]\n",
- "\n"
- ]
- }
- ],
- "source": [
- "import pandas as pd\n",
- "\n",
- "!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j' -O arxiv_ai_data.csv"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### Data Loading and Truncation\n",
- "\n",
- "To improve loading times, especially for users on the free Colab tier, the dataset is truncated to 100 rows by default. This means that only the first 100 entries from the original CSV file are loaded initially. You can change it to use the full dataset if you want!"
- ],
- "metadata": {
- "id": "vSEKGNESfakX"
- }
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "vtgyd3xrzsHQ",
- "outputId": "ecf8a1aa-5ad1-48a9-a506-8f6b412cd547"
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " authors | \n",
- " categories | \n",
- " comment | \n",
- " doi | \n",
- " entry_id | \n",
- " journal_ref | \n",
- " pdf_url | \n",
- " primary_category | \n",
- " published | \n",
- " summary | \n",
- " title | \n",
- " updated | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " [arxiv.Result.Author('M. L. Ginsberg')] | \n",
- " ['cs.AI'] | \n",
- " See http://www.jair.org/ for an online appendi... | \n",
- " NaN | \n",
- " http://arxiv.org/abs/cs/9308101v1 | \n",
- " Journal of Artificial Intelligence Research, V... | \n",
- " http://arxiv.org/pdf/cs/9308101v1 | \n",
- " cs.AI | \n",
- " 1993-08-01 00:00:00+00:00 | \n",
- " Because of their occasional need to return to ... | \n",
- " Dynamic Backtracking | \n",
- " 1993-08-01 00:00:00+00:00 | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " [arxiv.Result.Author('M. P. Wellman')] | \n",
- " ['cs.AI'] | \n",
- " See http://www.jair.org/ for any accompanying ... | \n",
- " NaN | \n",
- " http://arxiv.org/abs/cs/9308102v1 | \n",
- " Journal of Artificial Intelligence Research, V... | \n",
- " http://arxiv.org/pdf/cs/9308102v1 | \n",
- " cs.AI | \n",
- " 1993-08-01 00:00:00+00:00 | \n",
- " Market price systems constitute a well-underst... | \n",
- " A Market-Oriented Programming Environment and ... | \n",
- " 1993-08-01 00:00:00+00:00 | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " [arxiv.Result.Author('I. P. Gent'), arxiv.Resu... | \n",
- " ['cs.AI'] | \n",
- " See http://www.jair.org/ for any accompanying ... | \n",
- " NaN | \n",
- " http://arxiv.org/abs/cs/9309101v1 | \n",
- " Journal of Artificial Intelligence Research, V... | \n",
- " http://arxiv.org/pdf/cs/9309101v1 | \n",
- " cs.AI | \n",
- " 1993-09-01 00:00:00+00:00 | \n",
- " We describe an extensive study of search in GS... | \n",
- " An Empirical Analysis of Search in GSAT | \n",
- " 1993-09-01 00:00:00+00:00 | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " [arxiv.Result.Author('F. Bergadano'), arxiv.Re... | \n",
- " ['cs.AI'] | \n",
- " See http://www.jair.org/ for any accompanying ... | \n",
- " NaN | \n",
- " http://arxiv.org/abs/cs/9311101v1 | \n",
- " Journal of Artificial Intelligence Research, V... | \n",
- " http://arxiv.org/pdf/cs/9311101v1 | \n",
- " cs.AI | \n",
- " 1993-11-01 00:00:00+00:00 | \n",
- " As real logic programmers normally use cut (!)... | \n",
- " The Difficulties of Learning Logic Programs wi... | \n",
- " 1993-11-01 00:00:00+00:00 | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " [arxiv.Result.Author('J. C. Schlimmer'), arxiv... | \n",
- " ['cs.AI'] | \n",
- " See http://www.jair.org/ for an online appendi... | \n",
- " NaN | \n",
- " http://arxiv.org/abs/cs/9311102v1 | \n",
- " Journal of Artificial Intelligence Research, V... | \n",
- " http://arxiv.org/pdf/cs/9311102v1 | \n",
- " cs.AI | \n",
- " 1993-11-01 00:00:00+00:00 | \n",
- " To support the goal of allowing users to recor... | \n",
- " Software Agents: Completing Patterns and Const... | \n",
- " 1993-11-01 00:00:00+00:00 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " authors categories \\\n",
- "0 [arxiv.Result.Author('M. L. Ginsberg')] ['cs.AI'] \n",
- "1 [arxiv.Result.Author('M. P. Wellman')] ['cs.AI'] \n",
- "2 [arxiv.Result.Author('I. P. Gent'), arxiv.Resu... ['cs.AI'] \n",
- "3 [arxiv.Result.Author('F. Bergadano'), arxiv.Re... ['cs.AI'] \n",
- "4 [arxiv.Result.Author('J. C. Schlimmer'), arxiv... ['cs.AI'] \n",
- "\n",
- " comment doi \\\n",
- "0 See http://www.jair.org/ for an online appendi... NaN \n",
- "1 See http://www.jair.org/ for any accompanying ... NaN \n",
- "2 See http://www.jair.org/ for any accompanying ... NaN \n",
- "3 See http://www.jair.org/ for any accompanying ... NaN \n",
- "4 See http://www.jair.org/ for an online appendi... NaN \n",
- "\n",
- " entry_id \\\n",
- "0 http://arxiv.org/abs/cs/9308101v1 \n",
- "1 http://arxiv.org/abs/cs/9308102v1 \n",
- "2 http://arxiv.org/abs/cs/9309101v1 \n",
- "3 http://arxiv.org/abs/cs/9311101v1 \n",
- "4 http://arxiv.org/abs/cs/9311102v1 \n",
- "\n",
- " journal_ref \\\n",
- "0 Journal of Artificial Intelligence Research, V... \n",
- "1 Journal of Artificial Intelligence Research, V... \n",
- "2 Journal of Artificial Intelligence Research, V... \n",
- "3 Journal of Artificial Intelligence Research, V... \n",
- "4 Journal of Artificial Intelligence Research, V... \n",
- "\n",
- " pdf_url primary_category \\\n",
- "0 http://arxiv.org/pdf/cs/9308101v1 cs.AI \n",
- "1 http://arxiv.org/pdf/cs/9308102v1 cs.AI \n",
- "2 http://arxiv.org/pdf/cs/9309101v1 cs.AI \n",
- "3 http://arxiv.org/pdf/cs/9311101v1 cs.AI \n",
- "4 http://arxiv.org/pdf/cs/9311102v1 cs.AI \n",
- "\n",
- " published \\\n",
- "0 1993-08-01 00:00:00+00:00 \n",
- "1 1993-08-01 00:00:00+00:00 \n",
- "2 1993-09-01 00:00:00+00:00 \n",
- "3 1993-11-01 00:00:00+00:00 \n",
- "4 1993-11-01 00:00:00+00:00 \n",
- "\n",
- " summary \\\n",
- "0 Because of their occasional need to return to ... \n",
- "1 Market price systems constitute a well-underst... \n",
- "2 We describe an extensive study of search in GS... \n",
- "3 As real logic programmers normally use cut (!)... \n",
- "4 To support the goal of allowing users to recor... \n",
- "\n",
- " title \\\n",
- "0 Dynamic Backtracking \n",
- "1 A Market-Oriented Programming Environment and ... \n",
- "2 An Empirical Analysis of Search in GSAT \n",
- "3 The Difficulties of Learning Logic Programs wi... \n",
- "4 Software Agents: Completing Patterns and Const... \n",
- "\n",
- " updated \n",
- "0 1993-08-01 00:00:00+00:00 \n",
- "1 1993-08-01 00:00:00+00:00 \n",
- "2 1993-09-01 00:00:00+00:00 \n",
- "3 1993-11-01 00:00:00+00:00 \n",
- "4 1993-11-01 00:00:00+00:00 "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df = pd.read_csv('arxiv_ai_data.csv').head(100)\n",
- "df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KDP5NiI7zsHR",
- "outputId": "8311518d-f8ea-4e1a-b367-1b42dad261d1"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Index(['authors', 'categories', 'comment', 'doi', 'entry_id', 'journal_ref',\n",
- " 'pdf_url', 'primary_category', 'published', 'summary', 'title',\n",
- " 'updated'],\n",
- " dtype='object')"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.columns"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "JbKKQ04BzsHS"
- },
- "outputs": [],
- "source": [
- "df = pd.read_csv('arxiv_ai_data.csv').head(100)\n",
- "\n",
- "# Convert to datetime but keep it as datetime (more readable and usable)\n",
- "df['published'] = pd.to_datetime(df['published'])\n",
- "\n",
- "# Ensure summary is a string\n",
- "df['summary'] = df['summary'].astype(str)\n",
- "\n",
- "# Add 'text' column for similarity search\n",
- "df['text'] = df['title'] + \" \" + df['summary']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "x43SRJszzsHT",
- "outputId": "4ec685ae-5a4c-4024-a78d-590470434750"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "5 Most Recent Papers:\n",
- "--------------------------------------------------\n",
- "Date: 2011-06-30\n",
- "Title: Phase Transitions and Backbones of the Asymmetric Traveling Salesman Problem\n",
- "--------------------------------------------------\n",
- "Date: 2011-06-30\n",
- "Title: A Comprehensive Trainable Error Model for Sung Music Queries\n",
- "--------------------------------------------------\n",
- "Date: 2011-06-30\n",
- "Title: Finding Approximate POMDP solutions Through Belief Compression\n",
- "--------------------------------------------------\n",
- "Date: 2011-06-30\n",
- "Title: Ordered Landmarks in Planning\n",
- "--------------------------------------------------\n",
- "Date: 2011-06-30\n",
- "Title: On Prediction Using Variable Order Markov Models\n",
- "--------------------------------------------------\n",
- "\n",
- "5 Oldest Papers:\n",
- "--------------------------------------------------\n",
- "Date: 1993-11-01\n",
- "Title: Software Agents: Completing Patterns and Constructing User Interfaces\n",
- "--------------------------------------------------\n",
- "Date: 1993-11-01\n",
- "Title: The Difficulties of Learning Logic Programs with Cut\n",
- "--------------------------------------------------\n",
- "Date: 1993-09-01\n",
- "Title: An Empirical Analysis of Search in GSAT\n",
- "--------------------------------------------------\n",
- "Date: 1993-08-01\n",
- "Title: A Market-Oriented Programming Environment and its Application to Distributed Multicommodity Flow Problems\n",
- "--------------------------------------------------\n",
- "Date: 1993-08-01\n",
- "Title: Dynamic Backtracking\n",
- "--------------------------------------------------\n"
- ]
- }
- ],
- "source": [
- "# Sort the DataFrame by published date in descending order (newest first)\n",
- "df_sorted = df.sort_values(by='published', ascending=False)\n",
- "\n",
- "# Display the 5 most recent papers\n",
- "print(\"5 Most Recent Papers:\")\n",
- "print(\"-\" * 50)\n",
- "for idx, row in df_sorted.head().iterrows():\n",
- " print(f\"Date: {row['published'].strftime('%Y-%m-%d')}\")\n",
- " print(f\"Title: {row['title']}\")\n",
- " print(\"-\" * 50)\n",
- "\n",
- "# Optional: Display the oldest papers too\n",
- "print(\"\\n5 Oldest Papers:\")\n",
- "print(\"-\" * 50)\n",
- "for idx, row in df_sorted.tail().iterrows():\n",
- " print(f\"Date: {row['published'].strftime('%Y-%m-%d')}\")\n",
- " print(f\"Title: {row['title']}\")\n",
- " print(\"-\" * 50)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "4E41H7swzsHT"
- },
- "source": [
- "## Understanding Time-Aware Paper Discovery with RecencySpace\n",
- "\n",
- "When searching through research papers, finding relevant content is only half the story - knowing when it was published can be just as important. That's why we've implemented a smart time-aware search system using RecencySpace. Imagine you're organizing your digital library: you want to find papers not just by what they're about, but also by when they were written. Our system does exactly that, but automatically and intelligently.\n",
- "\n",
- "We've set up our search to look at both what a paper contains and when it was published. Using different time windows (like papers from the last few months, the last year, or even older), our system can prioritize papers based on their publication dates while still keeping track of their relevance to your search. It's like having a research assistant who knows exactly how to balance the importance of time and content in your search results.\n",
- "\n",
- "For our collection of AI research papers this time-aware approach helps us understand how ideas evolved during these foundational years. When you search for topics like \"quantum computing\" or \"neural networks,\" the system doesn't just find papers containing these terms - it also considers their place in the timeline of AI development. This makes it easier to trace how concepts developed and changed over time, giving us a clearer picture of AI's historical progression."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "YUW_nhL2zsHT"
- },
- "outputs": [],
- "source": [
- "class PaperSchema(sl.Schema):\n",
- " text: sl.String\n",
- " published: sl.Timestamp # This will handle datetime objects properly\n",
- " entry_id: sl.IdField\n",
- " title: sl.String\n",
- " summary: sl.String\n",
- "\n",
- "paper = PaperSchema()\n",
- "\n",
- "# Define spaces\n",
- "text_space = sl.TextSimilaritySpace(\n",
- " text=sl.chunk(paper.text, chunk_size=200, chunk_overlap=50),\n",
- " model=\"sentence-transformers/all-mpnet-base-v2\"\n",
- ")\n",
- "recency_space = sl.RecencySpace(\n",
- " timestamp=paper.published,\n",
- " period_time_list=[\n",
- " sl.PeriodTime(timedelta(days=365)), # papers within 1 year\n",
- " sl.PeriodTime(timedelta(days=2*365)), # papers within 2 years\n",
- " sl.PeriodTime(timedelta(days=3*365)), # papers within 3 years\n",
- " ],\n",
- " negative_filter=-0.25\n",
- ")"
- ]
- },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "imQS4g52zsHM"
+ },
+ "source": [
+ "### Installing required packages"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jrWsiNN_zsHO"
+ },
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "!pip3 install openai pandas sentence-transformers transformers superlinked==19.21.1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "qWYevol5zsHO"
+ },
+ "source": "### Setting up the Imports and LLM Provider Configuration\n\nThis notebook supports two LLM providers out of the box:\n\n| Provider | Default | How to enable |\n|----------|---------|---------------|\n| **OpenAI** | Yes | Set `OPENAI_API_KEY` |\n| **MiniMax** | No | Set `LLM_PROVIDER=minimax` and `MINIMAX_API_KEY` |\n\n[MiniMax](https://www.minimaxi.com/) exposes an OpenAI-compatible API, so the same OpenAI SDK is used — only the `base_url` and API key change. The default model is `MiniMax-M2.7` (204K-token context window)."
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "idNHQDiGzsHP",
+ "outputId": "d5b4f221-ab00-44db-d44f-cbcd34220f62"
+ },
+ "outputs": [],
+ "source": "import pandas as pd\nimport superlinked.framework as sl\nfrom datetime import timedelta\nfrom sentence_transformers import SentenceTransformer\nfrom openai import OpenAI\nimport os\nfrom abc import ABC, abstractmethod\nfrom typing import Any, Optional, Dict\nfrom tqdm import tqdm\n\ntry:\n from google.colab import userdata\n IN_COLAB = True\nexcept ImportError:\n IN_COLAB = False\n\n\n# Abstract Tool Class\nclass Tool(ABC):\n @abstractmethod\n def name(self) -> str:\n pass\n\n @abstractmethod\n def description(self) -> str:\n pass\n\n @abstractmethod\n def use(self, *args, **kwargs) -> Any:\n pass\n\n\ndef _get_secret(key: str) -> str:\n \"\"\"Read an API key from Colab user secrets or the environment.\"\"\"\n if IN_COLAB:\n try:\n return userdata.get(key)\n except Exception:\n pass\n return os.environ.get(key, \"\")\n\n\n# ---------------------------------------------------------------------------\n# LLM provider configuration\n# Set LLM_PROVIDER=minimax to use MiniMax instead of OpenAI.\n# MiniMax provides an OpenAI-compatible API, so the same OpenAI SDK works —\n# only the base URL and API key differ.\n#\n# OpenAI: (default)\n# OPENAI_API_KEY=sk-...\n#\n# MiniMax: https://www.minimaxi.com/\n# LLM_PROVIDER=minimax\n# MINIMAX_API_KEY=...\n# ---------------------------------------------------------------------------\nPROVIDER = os.environ.get(\"LLM_PROVIDER\", \"openai\")\n\nif PROVIDER == \"minimax\":\n api_key = _get_secret(\"MINIMAX_API_KEY\")\n if not api_key:\n raise ValueError(\n \"MINIMAX_API_KEY not found. Add it to Colab secrets or set the environment variable.\\n\"\n \"Get your key at https://www.minimaxi.com/\"\n )\n client = OpenAI(api_key=api_key, base_url=\"https://api.minimax.io/v1\")\n model = \"MiniMax-M2.7\"\n print(f\"Using MiniMax provider (model: {model}, context: 204K tokens)\")\nelse:\n api_key = _get_secret(\"OPENAI_API_KEY\")\n if not api_key:\n raise ValueError(\n \"OPENAI_API_KEY not found. Please add it using Tools > User secrets in Colab.\"\n )\n client = OpenAI(api_key=api_key)\n model = \"gpt-4\"\n print(f\"Using OpenAI provider (model: {model})\")"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "b5jxe_IAzsHQ"
+ },
+ "source": [
+ "### Downloading the relevant data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "72bCIO_uzsHQ",
+ "outputId": "0d410eaf-d7c9-48f3-d786-69231a9277c8"
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "05uWM0XizsHU"
- },
- "source": [
- "### Creating the index and ingesting the relevant data. we are using the in-memory superlinked executor to ingest the data\n"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "--2025-03-13 22:39:51-- https://drive.google.com/uc?export=download&id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j\n",
+ "Resolving drive.google.com (drive.google.com)... 142.250.207.238\n",
+ "Connecting to drive.google.com (drive.google.com)|142.250.207.238|:443... connected.\n",
+ "HTTP request sent, awaiting response... 303 See Other\n",
+ "Location: https://drive.usercontent.google.com/download?id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j&export=download [following]\n",
+ "--2025-03-13 22:39:52-- https://drive.usercontent.google.com/download?id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j&export=download\n",
+ "Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 142.250.193.225\n",
+ "Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|142.250.193.225|:443... connected.\n",
+ "HTTP request sent, awaiting response... 200 OK\n",
+ "Length: 13941471 (13M) [application/octet-stream]\n",
+ "Saving to: ‘arxiv_ai_data.csv’\n",
+ "\n",
+ "arxiv_ai_data.csv 100%[===================>] 13.29M 1.90MB/s in 7.3s \n",
+ "\n",
+ "2025-03-13 22:40:13 (1.81 MB/s) - ‘arxiv_ai_data.csv’ saved [13941471/13941471]\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1FCR3TW5yLjGhEmm-Uclw0_5PWVEaLk1j' -O arxiv_ai_data.csv"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### Data Loading and Truncation\n",
+ "\n",
+ "To improve loading times, especially for users on the free Colab tier, the dataset is truncated to 100 rows by default. This means that only the first 100 entries from the original CSV file are loaded initially. You can change it to use the full dataset if you want!"
+ ],
+ "metadata": {
+ "id": "vSEKGNESfakX"
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vtgyd3xrzsHQ",
+ "outputId": "ecf8a1aa-5ad1-48a9-a506-8f6b412cd547"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "source": [
- "# Create the index\n",
- "paper_index = sl.Index([text_space, recency_space])\n",
- "\n",
- "# Parser to map DataFrame columns to schema fields\n",
- "parser = sl.DataFrameParser(\n",
- " paper,\n",
- " mapping={\n",
- " paper.entry_id: \"entry_id\",\n",
- " paper.published: \"published\",\n",
- " paper.text: \"text\",\n",
- " paper.title: \"title\",\n",
- " paper.summary: \"summary\",\n",
- " }\n",
- ")\n",
- "\n",
- "# Set up in-memory source and executor\n",
- "source = sl.InMemorySource(paper, parser=parser)\n",
- "executor = sl.InMemoryExecutor(sources=[source], indices=[paper_index])\n",
- "app = executor.run()\n",
- "\n",
- "# Load the DataFrame with a progress bar using batches\n",
- "batch_size = 10\n",
- "data_batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]\n",
- "for batch in tqdm(data_batches, total=len(data_batches), desc=\"Loading Data into Source\"):\n",
- " source.put([batch])"
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " authors | \n",
+ " categories | \n",
+ " comment | \n",
+ " doi | \n",
+ " entry_id | \n",
+ " journal_ref | \n",
+ " pdf_url | \n",
+ " primary_category | \n",
+ " published | \n",
+ " summary | \n",
+ " title | \n",
+ " updated | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " [arxiv.Result.Author('M. L. Ginsberg')] | \n",
+ " ['cs.AI'] | \n",
+ " See http://www.jair.org/ for an online appendi... | \n",
+ " NaN | \n",
+ " http://arxiv.org/abs/cs/9308101v1 | \n",
+ " Journal of Artificial Intelligence Research, V... | \n",
+ " http://arxiv.org/pdf/cs/9308101v1 | \n",
+ " cs.AI | \n",
+ " 1993-08-01 00:00:00+00:00 | \n",
+ " Because of their occasional need to return to ... | \n",
+ " Dynamic Backtracking | \n",
+ " 1993-08-01 00:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " [arxiv.Result.Author('M. P. Wellman')] | \n",
+ " ['cs.AI'] | \n",
+ " See http://www.jair.org/ for any accompanying ... | \n",
+ " NaN | \n",
+ " http://arxiv.org/abs/cs/9308102v1 | \n",
+ " Journal of Artificial Intelligence Research, V... | \n",
+ " http://arxiv.org/pdf/cs/9308102v1 | \n",
+ " cs.AI | \n",
+ " 1993-08-01 00:00:00+00:00 | \n",
+ " Market price systems constitute a well-underst... | \n",
+ " A Market-Oriented Programming Environment and ... | \n",
+ " 1993-08-01 00:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " [arxiv.Result.Author('I. P. Gent'), arxiv.Resu... | \n",
+ " ['cs.AI'] | \n",
+ " See http://www.jair.org/ for any accompanying ... | \n",
+ " NaN | \n",
+ " http://arxiv.org/abs/cs/9309101v1 | \n",
+ " Journal of Artificial Intelligence Research, V... | \n",
+ " http://arxiv.org/pdf/cs/9309101v1 | \n",
+ " cs.AI | \n",
+ " 1993-09-01 00:00:00+00:00 | \n",
+ " We describe an extensive study of search in GS... | \n",
+ " An Empirical Analysis of Search in GSAT | \n",
+ " 1993-09-01 00:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " [arxiv.Result.Author('F. Bergadano'), arxiv.Re... | \n",
+ " ['cs.AI'] | \n",
+ " See http://www.jair.org/ for any accompanying ... | \n",
+ " NaN | \n",
+ " http://arxiv.org/abs/cs/9311101v1 | \n",
+ " Journal of Artificial Intelligence Research, V... | \n",
+ " http://arxiv.org/pdf/cs/9311101v1 | \n",
+ " cs.AI | \n",
+ " 1993-11-01 00:00:00+00:00 | \n",
+ " As real logic programmers normally use cut (!)... | \n",
+ " The Difficulties of Learning Logic Programs wi... | \n",
+ " 1993-11-01 00:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " [arxiv.Result.Author('J. C. Schlimmer'), arxiv... | \n",
+ " ['cs.AI'] | \n",
+ " See http://www.jair.org/ for an online appendi... | \n",
+ " NaN | \n",
+ " http://arxiv.org/abs/cs/9311102v1 | \n",
+ " Journal of Artificial Intelligence Research, V... | \n",
+ " http://arxiv.org/pdf/cs/9311102v1 | \n",
+ " cs.AI | \n",
+ " 1993-11-01 00:00:00+00:00 | \n",
+ " To support the goal of allowing users to recor... | \n",
+ " Software Agents: Completing Patterns and Const... | \n",
+ " 1993-11-01 00:00:00+00:00 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
],
- "metadata": {
- "id": "gomth7UxnxUv"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "UvRq6cWdzsHU"
- },
- "source": [
- "### Query definition"
+ "text/plain": [
+ " authors categories \\\n",
+ "0 [arxiv.Result.Author('M. L. Ginsberg')] ['cs.AI'] \n",
+ "1 [arxiv.Result.Author('M. P. Wellman')] ['cs.AI'] \n",
+ "2 [arxiv.Result.Author('I. P. Gent'), arxiv.Resu... ['cs.AI'] \n",
+ "3 [arxiv.Result.Author('F. Bergadano'), arxiv.Re... ['cs.AI'] \n",
+ "4 [arxiv.Result.Author('J. C. Schlimmer'), arxiv... ['cs.AI'] \n",
+ "\n",
+ " comment doi \\\n",
+ "0 See http://www.jair.org/ for an online appendi... NaN \n",
+ "1 See http://www.jair.org/ for any accompanying ... NaN \n",
+ "2 See http://www.jair.org/ for any accompanying ... NaN \n",
+ "3 See http://www.jair.org/ for any accompanying ... NaN \n",
+ "4 See http://www.jair.org/ for an online appendi... NaN \n",
+ "\n",
+ " entry_id \\\n",
+ "0 http://arxiv.org/abs/cs/9308101v1 \n",
+ "1 http://arxiv.org/abs/cs/9308102v1 \n",
+ "2 http://arxiv.org/abs/cs/9309101v1 \n",
+ "3 http://arxiv.org/abs/cs/9311101v1 \n",
+ "4 http://arxiv.org/abs/cs/9311102v1 \n",
+ "\n",
+ " journal_ref \\\n",
+ "0 Journal of Artificial Intelligence Research, V... \n",
+ "1 Journal of Artificial Intelligence Research, V... \n",
+ "2 Journal of Artificial Intelligence Research, V... \n",
+ "3 Journal of Artificial Intelligence Research, V... \n",
+ "4 Journal of Artificial Intelligence Research, V... \n",
+ "\n",
+ " pdf_url primary_category \\\n",
+ "0 http://arxiv.org/pdf/cs/9308101v1 cs.AI \n",
+ "1 http://arxiv.org/pdf/cs/9308102v1 cs.AI \n",
+ "2 http://arxiv.org/pdf/cs/9309101v1 cs.AI \n",
+ "3 http://arxiv.org/pdf/cs/9311101v1 cs.AI \n",
+ "4 http://arxiv.org/pdf/cs/9311102v1 cs.AI \n",
+ "\n",
+ " published \\\n",
+ "0 1993-08-01 00:00:00+00:00 \n",
+ "1 1993-08-01 00:00:00+00:00 \n",
+ "2 1993-09-01 00:00:00+00:00 \n",
+ "3 1993-11-01 00:00:00+00:00 \n",
+ "4 1993-11-01 00:00:00+00:00 \n",
+ "\n",
+ " summary \\\n",
+ "0 Because of their occasional need to return to ... \n",
+ "1 Market price systems constitute a well-underst... \n",
+ "2 We describe an extensive study of search in GS... \n",
+ "3 As real logic programmers normally use cut (!)... \n",
+ "4 To support the goal of allowing users to recor... \n",
+ "\n",
+ " title \\\n",
+ "0 Dynamic Backtracking \n",
+ "1 A Market-Oriented Programming Environment and ... \n",
+ "2 An Empirical Analysis of Search in GSAT \n",
+ "3 The Difficulties of Learning Logic Programs wi... \n",
+ "4 Software Agents: Completing Patterns and Const... \n",
+ "\n",
+ " updated \n",
+ "0 1993-08-01 00:00:00+00:00 \n",
+ "1 1993-08-01 00:00:00+00:00 \n",
+ "2 1993-09-01 00:00:00+00:00 \n",
+ "3 1993-11-01 00:00:00+00:00 \n",
+ "4 1993-11-01 00:00:00+00:00 "
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "MNsIF98ezsHV"
- },
- "outputs": [],
- "source": [
- "# Define the query\n",
- "knowledgebase_query = (\n",
- " sl.Query(\n",
- " paper_index,\n",
- " weights={\n",
- " text_space: sl.Param(\"relevance_weight\"),\n",
- " recency_space: sl.Param(\"recency_weight\"),\n",
- " }\n",
- " )\n",
- " .find(paper)\n",
- " .similar(text_space, sl.Param(\"search_query\"))\n",
- " .select(paper.entry_id, paper.published, paper.text, paper.title, paper.summary)\n",
- " .limit(sl.Param(\"limit\"))\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "rgBJz6avzsHV"
- },
- "source": [
- "### Defining the tools for the kernel agent"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "vIBL9FuCzsHV"
- },
- "source": [
- "### Retrieval Tool"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "qHjKsMjDzsHV"
- },
- "outputs": [],
- "source": [
- "class RetrievalTool(Tool):\n",
- " def __init__(self, df, app, knowledgebase_query, client, model):\n",
- " self.df = df\n",
- " self.app = app\n",
- " self.knowledgebase_query = knowledgebase_query\n",
- " self.client = client\n",
- " self.model = model\n",
- "\n",
- " def name(self) -> str:\n",
- " return \"RetrievalTool\"\n",
- "\n",
- " def description(self) -> str:\n",
- " return \"Retrieves a list of relevant papers based on a query using Superlinked.\"\n",
- "\n",
- " def use(self, query: str) -> pd.DataFrame:\n",
- " result = self.app.query(\n",
- " self.knowledgebase_query,\n",
- " relevance_weight=1.0,\n",
- " recency_weight=0.5,\n",
- " search_query=query,\n",
- " limit=5\n",
- " )\n",
- " df_result = sl.PandasConverter.to_pandas(result)\n",
- " # Ensure summary is a string\n",
- " if 'summary' in df_result.columns:\n",
- " df_result['summary'] = df_result['summary'].astype(str)\n",
- " else:\n",
- " print(\"Warning: 'summary' column not found in retrieved DataFrame.\")\n",
- " return df_result"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "krt1ZDbqzsHW"
- },
- "source": [
- "### Summarization Tool"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "91GnOxVazsHW"
- },
- "outputs": [],
- "source": [
- "class SummarizationTool(Tool):\n",
- " def __init__(self, df, client, model):\n",
- " self.df = df\n",
- " self.client = client\n",
- " self.model = model\n",
- "\n",
- " def name(self) -> str:\n",
- " return \"SummarizationTool\"\n",
- "\n",
- " def description(self) -> str:\n",
- " return \"Generates a concise summary of specified papers using an LLM.\"\n",
- "\n",
- " def use(self, query: str, paper_ids: list) -> str:\n",
- " papers = self.df[self.df['entry_id'].isin(paper_ids)]\n",
- " if papers.empty:\n",
- " return \"No papers found with the given IDs.\"\n",
- " summaries = papers['summary'].tolist()\n",
- " summary_str = \"\\n\\n\".join(summaries)\n",
- " prompt = f\"\"\"\n",
- " Summarize the following paper summaries:\\n\\n{summary_str}\\n\\nProvide a concise summary.\n",
- " \"\"\"\n",
- " response = self.client.chat.completions.create(\n",
- " model=self.model,\n",
- " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
- " temperature=0.7,\n",
- " max_tokens=500\n",
- " )\n",
- " return response.choices[0].message.content.strip()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "jtFVy9LAzsHW"
- },
- "source": [
- "### Question Answer Tool"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "yOOH3LYkzsHW"
- },
- "outputs": [],
- "source": [
- "class QuestionAnsweringTool(Tool):\n",
- " def __init__(self, retrieval_tool, client, model):\n",
- " self.retrieval_tool = retrieval_tool\n",
- " self.client = client\n",
- " self.model = model\n",
- "\n",
- " def name(self) -> str:\n",
- " return \"QuestionAnsweringTool\"\n",
- "\n",
- " def description(self) -> str:\n",
- " return \"Answers questions about research topics using retrieved paper summaries or general knowledge if no specific context is available.\"\n",
- "\n",
- " def use(self, query: str) -> str:\n",
- " df_result = self.retrieval_tool.use(query)\n",
- " if 'summary' not in df_result.columns:\n",
- " # Tag as a general question if summary is missing\n",
- " prompt = f\"\"\"\n",
- " You are a knowledgeable research assistant. This is a general question tagged as [GENERAL]. Answer based on your broad knowledge, not limited to specific paper summaries. If you don't know the answer, provide a brief explanation of why.\n",
- "\n",
- " User's question: {query}\n",
- " \"\"\"\n",
- " else:\n",
- " # Use paper summaries for specific context\n",
- " contexts = df_result['summary'].tolist()\n",
- " context_str = \"\\n\\n\".join(contexts)\n",
- " prompt = f\"\"\"\n",
- " You are a research assistant. Use the following paper summaries to answer the user's question. If you don't know the answer based on the summaries, say 'I don't know.'\n",
- "\n",
- " Paper summaries:\n",
- " {context_str}\n",
- "\n",
- " User's question: {query}\n",
- " \"\"\"\n",
- " response = self.client.chat.completions.create(\n",
- " model=self.model,\n",
- " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
- " temperature=0.7,\n",
- " max_tokens=500\n",
- " )\n",
- " return response.choices[0].message.content.strip()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "ivbNnN7HzsHW"
- },
- "source": [
- "### Setting up the kernel agent"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "_044s1W_zsHX"
- },
- "outputs": [],
- "source": [
- "class KernelAgent:\n",
- " def __init__(self, retrieval_tool: RetrievalTool, summarization_tool: SummarizationTool, question_answering_tool: QuestionAnsweringTool, client, model):\n",
- " self.retrieval_tool = retrieval_tool\n",
- " self.summarization_tool = summarization_tool\n",
- " self.question_answering_tool = question_answering_tool\n",
- " self.client = client\n",
- " self.model = model\n",
- "\n",
- " def classify_query(self, query: str) -> str:\n",
- " prompt = f\"\"\"\n",
- " Classify the following user prompt into one of the three categories:\n",
- " - retrieval: The user wants to find a list of papers based on some criteria (e.g., 'Find papers on AI ethics from 2020').\n",
- " - summarization: The user wants to summarize a list of papers (e.g., 'Summarize papers with entry_id 123, 456, 789').\n",
- " - question_answering: The user wants to ask a question about research topics and get an answer (e.g., 'What is the latest development in AI ethics?').\n",
- "\n",
- " User prompt: {query}\n",
- "\n",
- " Respond with only the category name (retrieval, summarization, question_answering).\n",
- " If unsure, respond with 'unknown'.\n",
- " \"\"\"\n",
- " response = self.client.chat.completions.create(\n",
- " model=self.model,\n",
- " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
- " temperature=0.7,\n",
- " max_tokens=10\n",
- " )\n",
- " classification = response.choices[0].message.content.strip().lower()\n",
- " print(f\"Query type: {classification}\")\n",
- " return classification\n",
- "\n",
- " def process_query(self, query: str, params: Optional[Dict] = None) -> str:\n",
- " query_type = self.classify_query(query)\n",
- " if query_type == 'retrieval':\n",
- " df_result = self.retrieval_tool.use(query)\n",
- " response = \"Here are the top papers:\\n\"\n",
- " for i, row in df_result.iterrows():\n",
- " # Ensure summary is a string and handle empty cases\n",
- " summary = str(row['summary']) if pd.notna(row['summary']) else \"\"\n",
- " response += f\"{i+1}. {row['title']} \\nSummary: {summary[:200]}...\\n\\n\"\n",
- " return response\n",
- " elif query_type == 'summarization':\n",
- " if not params or 'paper_ids' not in params:\n",
- " return \"Error: Summarization query requires a 'paper_ids' parameter with a list of entry_ids.\"\n",
- " return self.summarization_tool.use(query, params['paper_ids'])\n",
- " elif query_type == 'question_answering':\n",
- " return self.question_answering_tool.use(query)\n",
- " else:\n",
- " return \"Error: Unable to classify query as 'retrieval', 'summarization', or 'question_answering'.\""
- ]
- },
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_csv('arxiv_ai_data.csv').head(100)\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KDP5NiI7zsHR",
+ "outputId": "8311518d-f8ea-4e1a-b367-1b42dad261d1"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "kMAd4Gl9zsHX"
- },
- "outputs": [],
- "source": [
- "retrieval_tool = RetrievalTool(df, app, knowledgebase_query, client, model)\n",
- "summarization_tool = SummarizationTool(df, client, model)\n",
- "question_answering_tool = QuestionAnsweringTool(retrieval_tool, client, model)\n",
- "\n",
- "# Initialize KernelAgent\n",
- "kernel_agent = KernelAgent(retrieval_tool, summarization_tool, question_answering_tool, client, model)"
+ "data": {
+ "text/plain": [
+ "Index(['authors', 'categories', 'comment', 'doi', 'entry_id', 'journal_ref',\n",
+ " 'pdf_url', 'primary_category', 'published', 'summary', 'title',\n",
+ " 'updated'],\n",
+ " dtype='object')"
]
- },
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.columns"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JbKKQ04BzsHS"
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.read_csv('arxiv_ai_data.csv').head(100)\n",
+ "\n",
+ "# Convert to datetime but keep it as datetime (more readable and usable)\n",
+ "df['published'] = pd.to_datetime(df['published'])\n",
+ "\n",
+ "# Ensure summary is a string\n",
+ "df['summary'] = df['summary'].astype(str)\n",
+ "\n",
+ "# Add 'text' column for similarity search\n",
+ "df['text'] = df['title'] + \" \" + df['summary']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "x43SRJszzsHT",
+ "outputId": "4ec685ae-5a4c-4024-a78d-590470434750"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "fZQeakDIzsHX",
- "outputId": "5ed2567d-8a73-436f-8ea3-8273f68e3bc0"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Query type: retrieval\n",
- "Here are the top papers:\n",
- "1. Quantum Computing and Phase Transitions in Combinatorial Search \n",
- "Summary: We introduce an algorithm for combinatorial search on quantum computers that\n",
- "is capable of significantly concentrating amplitude into solutions for some NP\n",
- "search problems, on average. This is done by...\n",
- "\n",
- "2. The Road to Quantum Artificial Intelligence \n",
- "Summary: This paper overviews the basic principles and recent advances in the emerging\n",
- "field of Quantum Computation (QC), highlighting its potential application to\n",
- "Artificial Intelligence (AI). The paper provi...\n",
- "\n",
- "3. Solving Highly Constrained Search Problems with Quantum Computers \n",
- "Summary: A previously developed quantum search algorithm for solving 1-SAT problems in\n",
- "a single step is generalized to apply to a range of highly constrained k-SAT\n",
- "problems. We identify a bound on the number o...\n",
- "\n",
- "4. The model of quantum evolution \n",
- "Summary: This paper has been withdrawn by the author due to extremely unscientific\n",
- "errors....\n",
- "\n",
- "5. Artificial and Biological Intelligence \n",
- "Summary: This article considers evidence from physical and biological sciences to show\n",
- "machines are deficient compared to biological systems at incorporating\n",
- "intelligence. Machines fall short on two counts: fi...\n",
- "\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Test query\n",
- "print(kernel_agent.process_query(\"Find papers on quantum computing in last 10 years\"))"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "5 Most Recent Papers:\n",
+ "--------------------------------------------------\n",
+ "Date: 2011-06-30\n",
+ "Title: Phase Transitions and Backbones of the Asymmetric Traveling Salesman Problem\n",
+ "--------------------------------------------------\n",
+ "Date: 2011-06-30\n",
+ "Title: A Comprehensive Trainable Error Model for Sung Music Queries\n",
+ "--------------------------------------------------\n",
+ "Date: 2011-06-30\n",
+ "Title: Finding Approximate POMDP solutions Through Belief Compression\n",
+ "--------------------------------------------------\n",
+ "Date: 2011-06-30\n",
+ "Title: Ordered Landmarks in Planning\n",
+ "--------------------------------------------------\n",
+ "Date: 2011-06-30\n",
+ "Title: On Prediction Using Variable Order Markov Models\n",
+ "--------------------------------------------------\n",
+ "\n",
+ "5 Oldest Papers:\n",
+ "--------------------------------------------------\n",
+ "Date: 1993-11-01\n",
+ "Title: Software Agents: Completing Patterns and Constructing User Interfaces\n",
+ "--------------------------------------------------\n",
+ "Date: 1993-11-01\n",
+ "Title: The Difficulties of Learning Logic Programs with Cut\n",
+ "--------------------------------------------------\n",
+ "Date: 1993-09-01\n",
+ "Title: An Empirical Analysis of Search in GSAT\n",
+ "--------------------------------------------------\n",
+ "Date: 1993-08-01\n",
+ "Title: A Market-Oriented Programming Environment and its Application to Distributed Multicommodity Flow Problems\n",
+ "--------------------------------------------------\n",
+ "Date: 1993-08-01\n",
+ "Title: Dynamic Backtracking\n",
+ "--------------------------------------------------\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Sort the DataFrame by published date in descending order (newest first)\n",
+ "df_sorted = df.sort_values(by='published', ascending=False)\n",
+ "\n",
+ "# Display the 5 most recent papers\n",
+ "print(\"5 Most Recent Papers:\")\n",
+ "print(\"-\" * 50)\n",
+ "for idx, row in df_sorted.head().iterrows():\n",
+ " print(f\"Date: {row['published'].strftime('%Y-%m-%d')}\")\n",
+ " print(f\"Title: {row['title']}\")\n",
+ " print(\"-\" * 50)\n",
+ "\n",
+ "# Optional: Display the oldest papers too\n",
+ "print(\"\\n5 Oldest Papers:\")\n",
+ "print(\"-\" * 50)\n",
+ "for idx, row in df_sorted.tail().iterrows():\n",
+ " print(f\"Date: {row['published'].strftime('%Y-%m-%d')}\")\n",
+ " print(f\"Title: {row['title']}\")\n",
+ " print(\"-\" * 50)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4E41H7swzsHT"
+ },
+ "source": [
+ "## Understanding Time-Aware Paper Discovery with RecencySpace\n",
+ "\n",
+ "When searching through research papers, finding relevant content is only half the story - knowing when it was published can be just as important. That's why we've implemented a smart time-aware search system using RecencySpace. Imagine you're organizing your digital library: you want to find papers not just by what they're about, but also by when they were written. Our system does exactly that, but automatically and intelligently.\n",
+ "\n",
+ "We've set up our search to look at both what a paper contains and when it was published. Using different time windows (like papers from the last few months, the last year, or even older), our system can prioritize papers based on their publication dates while still keeping track of their relevance to your search. It's like having a research assistant who knows exactly how to balance the importance of time and content in your search results.\n",
+ "\n",
+ "For our collection of AI research papers this time-aware approach helps us understand how ideas evolved during these foundational years. When you search for topics like \"quantum computing\" or \"neural networks,\" the system doesn't just find papers containing these terms - it also considers their place in the timeline of AI development. This makes it easier to trace how concepts developed and changed over time, giving us a clearer picture of AI's historical progression."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "YUW_nhL2zsHT"
+ },
+ "outputs": [],
+ "source": [
+ "class PaperSchema(sl.Schema):\n",
+ " text: sl.String\n",
+ " published: sl.Timestamp # This will handle datetime objects properly\n",
+ " entry_id: sl.IdField\n",
+ " title: sl.String\n",
+ " summary: sl.String\n",
+ "\n",
+ "paper = PaperSchema()\n",
+ "\n",
+ "# Define spaces\n",
+ "text_space = sl.TextSimilaritySpace(\n",
+ " text=sl.chunk(paper.text, chunk_size=200, chunk_overlap=50),\n",
+ " model=\"sentence-transformers/all-mpnet-base-v2\"\n",
+ ")\n",
+ "recency_space = sl.RecencySpace(\n",
+ " timestamp=paper.published,\n",
+ " period_time_list=[\n",
+ " sl.PeriodTime(timedelta(days=365)), # papers within 1 year\n",
+ " sl.PeriodTime(timedelta(days=2*365)), # papers within 2 years\n",
+ " sl.PeriodTime(timedelta(days=3*365)), # papers within 3 years\n",
+ " ],\n",
+ " negative_filter=-0.25\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "05uWM0XizsHU"
+ },
+ "source": [
+ "### Creating the index and ingesting the relevant data. we are using the in-memory superlinked executor to ingest the data\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Create the index\n",
+ "paper_index = sl.Index([text_space, recency_space])\n",
+ "\n",
+ "# Parser to map DataFrame columns to schema fields\n",
+ "parser = sl.DataFrameParser(\n",
+ " paper,\n",
+ " mapping={\n",
+ " paper.entry_id: \"entry_id\",\n",
+ " paper.published: \"published\",\n",
+ " paper.text: \"text\",\n",
+ " paper.title: \"title\",\n",
+ " paper.summary: \"summary\",\n",
+ " }\n",
+ ")\n",
+ "\n",
+ "# Set up in-memory source and executor\n",
+ "source = sl.InMemorySource(paper, parser=parser)\n",
+ "executor = sl.InMemoryExecutor(sources=[source], indices=[paper_index])\n",
+ "app = executor.run()\n",
+ "\n",
+ "# Load the DataFrame with a progress bar using batches\n",
+ "batch_size = 10\n",
+ "data_batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]\n",
+ "for batch in tqdm(data_batches, total=len(data_batches), desc=\"Loading Data into Source\"):\n",
+ " source.put([batch])"
+ ],
+ "metadata": {
+ "id": "gomth7UxnxUv"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UvRq6cWdzsHU"
+ },
+ "source": [
+ "### Query definition"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "MNsIF98ezsHV"
+ },
+ "outputs": [],
+ "source": [
+ "# Define the query\n",
+ "knowledgebase_query = (\n",
+ " sl.Query(\n",
+ " paper_index,\n",
+ " weights={\n",
+ " text_space: sl.Param(\"relevance_weight\"),\n",
+ " recency_space: sl.Param(\"recency_weight\"),\n",
+ " }\n",
+ " )\n",
+ " .find(paper)\n",
+ " .similar(text_space, sl.Param(\"search_query\"))\n",
+ " .select(paper.entry_id, paper.published, paper.text, paper.title, paper.summary)\n",
+ " .limit(sl.Param(\"limit\"))\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "rgBJz6avzsHV"
+ },
+ "source": [
+ "### Defining the tools for the kernel agent"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vIBL9FuCzsHV"
+ },
+ "source": [
+ "### Retrieval Tool"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "qHjKsMjDzsHV"
+ },
+ "outputs": [],
+ "source": [
+ "class RetrievalTool(Tool):\n",
+ " def __init__(self, df, app, knowledgebase_query, client, model):\n",
+ " self.df = df\n",
+ " self.app = app\n",
+ " self.knowledgebase_query = knowledgebase_query\n",
+ " self.client = client\n",
+ " self.model = model\n",
+ "\n",
+ " def name(self) -> str:\n",
+ " return \"RetrievalTool\"\n",
+ "\n",
+ " def description(self) -> str:\n",
+ " return \"Retrieves a list of relevant papers based on a query using Superlinked.\"\n",
+ "\n",
+ " def use(self, query: str) -> pd.DataFrame:\n",
+ " result = self.app.query(\n",
+ " self.knowledgebase_query,\n",
+ " relevance_weight=1.0,\n",
+ " recency_weight=0.5,\n",
+ " search_query=query,\n",
+ " limit=5\n",
+ " )\n",
+ " df_result = sl.PandasConverter.to_pandas(result)\n",
+ " # Ensure summary is a string\n",
+ " if 'summary' in df_result.columns:\n",
+ " df_result['summary'] = df_result['summary'].astype(str)\n",
+ " else:\n",
+ " print(\"Warning: 'summary' column not found in retrieved DataFrame.\")\n",
+ " return df_result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "krt1ZDbqzsHW"
+ },
+ "source": [
+ "### Summarization Tool"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "91GnOxVazsHW"
+ },
+ "outputs": [],
+ "source": [
+ "class SummarizationTool(Tool):\n",
+ " def __init__(self, df, client, model):\n",
+ " self.df = df\n",
+ " self.client = client\n",
+ " self.model = model\n",
+ "\n",
+ " def name(self) -> str:\n",
+ " return \"SummarizationTool\"\n",
+ "\n",
+ " def description(self) -> str:\n",
+ " return \"Generates a concise summary of specified papers using an LLM.\"\n",
+ "\n",
+ " def use(self, query: str, paper_ids: list) -> str:\n",
+ " papers = self.df[self.df['entry_id'].isin(paper_ids)]\n",
+ " if papers.empty:\n",
+ " return \"No papers found with the given IDs.\"\n",
+ " summaries = papers['summary'].tolist()\n",
+ " summary_str = \"\\n\\n\".join(summaries)\n",
+ " prompt = f\"\"\"\n",
+ " Summarize the following paper summaries:\\n\\n{summary_str}\\n\\nProvide a concise summary.\n",
+ " \"\"\"\n",
+ " response = self.client.chat.completions.create(\n",
+ " model=self.model,\n",
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
+ " temperature=0.7,\n",
+ " max_tokens=500\n",
+ " )\n",
+ " return response.choices[0].message.content.strip()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jtFVy9LAzsHW"
+ },
+ "source": [
+ "### Question Answer Tool"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yOOH3LYkzsHW"
+ },
+ "outputs": [],
+ "source": [
+ "class QuestionAnsweringTool(Tool):\n",
+ " def __init__(self, retrieval_tool, client, model):\n",
+ " self.retrieval_tool = retrieval_tool\n",
+ " self.client = client\n",
+ " self.model = model\n",
+ "\n",
+ " def name(self) -> str:\n",
+ " return \"QuestionAnsweringTool\"\n",
+ "\n",
+ " def description(self) -> str:\n",
+ " return \"Answers questions about research topics using retrieved paper summaries or general knowledge if no specific context is available.\"\n",
+ "\n",
+ " def use(self, query: str) -> str:\n",
+ " df_result = self.retrieval_tool.use(query)\n",
+ " if 'summary' not in df_result.columns:\n",
+ " # Tag as a general question if summary is missing\n",
+ " prompt = f\"\"\"\n",
+ " You are a knowledgeable research assistant. This is a general question tagged as [GENERAL]. Answer based on your broad knowledge, not limited to specific paper summaries. If you don't know the answer, provide a brief explanation of why.\n",
+ "\n",
+ " User's question: {query}\n",
+ " \"\"\"\n",
+ " else:\n",
+ " # Use paper summaries for specific context\n",
+ " contexts = df_result['summary'].tolist()\n",
+ " context_str = \"\\n\\n\".join(contexts)\n",
+ " prompt = f\"\"\"\n",
+ " You are a research assistant. Use the following paper summaries to answer the user's question. If you don't know the answer based on the summaries, say 'I don't know.'\n",
+ "\n",
+ " Paper summaries:\n",
+ " {context_str}\n",
+ "\n",
+ " User's question: {query}\n",
+ " \"\"\"\n",
+ " response = self.client.chat.completions.create(\n",
+ " model=self.model,\n",
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
+ " temperature=0.7,\n",
+ " max_tokens=500\n",
+ " )\n",
+ " return response.choices[0].message.content.strip()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ivbNnN7HzsHW"
+ },
+ "source": [
+ "### Setting up the kernel agent"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_044s1W_zsHX"
+ },
+ "outputs": [],
+ "source": [
+ "class KernelAgent:\n",
+ " def __init__(self, retrieval_tool: RetrievalTool, summarization_tool: SummarizationTool, question_answering_tool: QuestionAnsweringTool, client, model):\n",
+ " self.retrieval_tool = retrieval_tool\n",
+ " self.summarization_tool = summarization_tool\n",
+ " self.question_answering_tool = question_answering_tool\n",
+ " self.client = client\n",
+ " self.model = model\n",
+ "\n",
+ " def classify_query(self, query: str) -> str:\n",
+ " prompt = f\"\"\"\n",
+ " Classify the following user prompt into one of the three categories:\n",
+ " - retrieval: The user wants to find a list of papers based on some criteria (e.g., 'Find papers on AI ethics from 2020').\n",
+ " - summarization: The user wants to summarize a list of papers (e.g., 'Summarize papers with entry_id 123, 456, 789').\n",
+ " - question_answering: The user wants to ask a question about research topics and get an answer (e.g., 'What is the latest development in AI ethics?').\n",
+ "\n",
+ " User prompt: {query}\n",
+ "\n",
+ " Respond with only the category name (retrieval, summarization, question_answering).\n",
+ " If unsure, respond with 'unknown'.\n",
+ " \"\"\"\n",
+ " response = self.client.chat.completions.create(\n",
+ " model=self.model,\n",
+ " messages=[{\"role\": \"user\", \"content\": prompt}],\n",
+ " temperature=0.7,\n",
+ " max_tokens=10\n",
+ " )\n",
+ " classification = response.choices[0].message.content.strip().lower()\n",
+ " print(f\"Query type: {classification}\")\n",
+ " return classification\n",
+ "\n",
+ " def process_query(self, query: str, params: Optional[Dict] = None) -> str:\n",
+ " query_type = self.classify_query(query)\n",
+ " if query_type == 'retrieval':\n",
+ " df_result = self.retrieval_tool.use(query)\n",
+ " response = \"Here are the top papers:\\n\"\n",
+ " for i, row in df_result.iterrows():\n",
+ " # Ensure summary is a string and handle empty cases\n",
+ " summary = str(row['summary']) if pd.notna(row['summary']) else \"\"\n",
+ " response += f\"{i+1}. {row['title']} \\nSummary: {summary[:200]}...\\n\\n\"\n",
+ " return response\n",
+ " elif query_type == 'summarization':\n",
+ " if not params or 'paper_ids' not in params:\n",
+ " return \"Error: Summarization query requires a 'paper_ids' parameter with a list of entry_ids.\"\n",
+ " return self.summarization_tool.use(query, params['paper_ids'])\n",
+ " elif query_type == 'question_answering':\n",
+ " return self.question_answering_tool.use(query)\n",
+ " else:\n",
+ " return \"Error: Unable to classify query as 'retrieval', 'summarization', or 'question_answering'.\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "kMAd4Gl9zsHX"
+ },
+ "outputs": [],
+ "source": [
+ "retrieval_tool = RetrievalTool(df, app, knowledgebase_query, client, model)\n",
+ "summarization_tool = SummarizationTool(df, client, model)\n",
+ "question_answering_tool = QuestionAnsweringTool(retrieval_tool, client, model)\n",
+ "\n",
+ "# Initialize KernelAgent\n",
+ "kernel_agent = KernelAgent(retrieval_tool, summarization_tool, question_answering_tool, client, model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "fZQeakDIzsHX",
+ "outputId": "5ed2567d-8a73-436f-8ea3-8273f68e3bc0"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "64MU5Mo4zsHY",
- "outputId": "c655a1d4-03c8-4fb9-b3ea-bf90dca8042a"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Query type: summarization\n",
- "The paper discusses the challenges of learning logic programs that contain the cut predicate. It suggests a method of first generating a base program without cut and then adding cut where necessary to make it consistent with the positive examples. The paper concludes that learning programs with cut is difficult due to the need for intensional evaluation, and current induction techniques may need to be restricted to purely declarative logic languages.\n"
- ]
- }
- ],
- "source": [
- "print(kernel_agent.process_query(\"Summarize this paper\", params={\"paper_ids\": [\"http://arxiv.org/abs/cs/9311101v1\"]}))"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Query type: retrieval\n",
+ "Here are the top papers:\n",
+ "1. Quantum Computing and Phase Transitions in Combinatorial Search \n",
+ "Summary: We introduce an algorithm for combinatorial search on quantum computers that\n",
+ "is capable of significantly concentrating amplitude into solutions for some NP\n",
+ "search problems, on average. This is done by...\n",
+ "\n",
+ "2. The Road to Quantum Artificial Intelligence \n",
+ "Summary: This paper overviews the basic principles and recent advances in the emerging\n",
+ "field of Quantum Computation (QC), highlighting its potential application to\n",
+ "Artificial Intelligence (AI). The paper provi...\n",
+ "\n",
+ "3. Solving Highly Constrained Search Problems with Quantum Computers \n",
+ "Summary: A previously developed quantum search algorithm for solving 1-SAT problems in\n",
+ "a single step is generalized to apply to a range of highly constrained k-SAT\n",
+ "problems. We identify a bound on the number o...\n",
+ "\n",
+ "4. The model of quantum evolution \n",
+ "Summary: This paper has been withdrawn by the author due to extremely unscientific\n",
+ "errors....\n",
+ "\n",
+ "5. Artificial and Biological Intelligence \n",
+ "Summary: This article considers evidence from physical and biological sciences to show\n",
+ "machines are deficient compared to biological systems at incorporating\n",
+ "intelligence. Machines fall short on two counts: fi...\n",
+ "\n",
+ "\n"
+ ]
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "env",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.11.9"
- },
- "colab": {
- "provenance": []
+ ],
+ "source": [
+ "# Test query\n",
+ "print(kernel_agent.process_query(\"Find papers on quantum computing in last 10 years\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "64MU5Mo4zsHY",
+ "outputId": "c655a1d4-03c8-4fb9-b3ea-bf90dca8042a"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Query type: summarization\n",
+ "The paper discusses the challenges of learning logic programs that contain the cut predicate. It suggests a method of first generating a base program without cut and then adding cut where necessary to make it consistent with the positive examples. The paper concludes that learning programs with cut is difficult due to the need for intensional evaluation, and current induction techniques may need to be restricted to purely declarative logic languages.\n"
+ ]
}
+ ],
+ "source": [
+ "print(kernel_agent.process_query(\"Summarize this paper\", params={\"paper_ids\": [\"http://arxiv.org/abs/cs/9311101v1\"]}))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
},
- "nbformat": 4,
- "nbformat_minor": 0
+ "colab": {
+ "provenance": []
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}
\ No newline at end of file