Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 25 additions & 34 deletions docsite/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion docsite/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,11 @@
},
"overrides": {
"svgo": "^3.3.3",
"serialize-javascript": "^7.0.3",
"serialize-javascript": "^7.0.5",
"node-forge": "^1.4.0",
"brace-expansion": "^1.1.13",
"path-to-regexp": "^0.1.13",
"picomatch": "^4.0.0",
"minimatch": "^3.1.4",
"ajv": "^8.18.0",
"qs": "^6.14.2",
Expand Down
29 changes: 22 additions & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ classifiers = [
dependencies = [
"langchain-community>=0.4.0",
"langchain-openai>=1.0.0",
"langgraph>=1.1.1,<2.0.0",
"nltk>=3.9.3",
"langgraph>=1.1.3,<2.0.0",
"nltk>=3.9.4",
"numpy<=2.3.0",
"asyncpg>=0.30.0",
"fastapi[standard]>=0.116.1",
Expand All @@ -54,10 +54,19 @@ dependencies = [
"aiofiles>=23.2.1",
"tavily-python>=0.1.11",
"pillow>=12.1.1",
"cryptography>=46.0.5",
"filelock>=3.20.3",
"PyJWT>=2.12.0",
"orjson>=3.11.6",
"cryptography>=46.0.6",
"filelock>=3.25.0",
"PyJWT>=2.12.1",
"orjson>=3.11.7",
"tornado>=6.5.3",
"protobuf>=6.33.0",
"pyasn1>=0.6.3",
"marshmallow>=3.26.2",
"pygments>=2.20.0",
"pyopenssl>=26.0.0",
"langsmith>=0.4.50",
"fonttools>=4.62.1",
"requests>=2.33.0",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -99,7 +108,7 @@ oracle = [
]

streamlit = [
"streamlit==1.50.0",
"streamlit>=1.51.0",
"pyngrok==7.4.0",
"python-dotenv==1.1.1",
"xlsxwriter==3.2.9",
Expand Down Expand Up @@ -144,6 +153,12 @@ dev = [
"twine>=6.1.0",
]

[tool.uv]
override-dependencies = [
# pysonar (dev-only) pins requests==2.32.5, which conflicts with the requests>=2.33.0 requirement; override the pin so the CVE-fixed release is installed
"requests>=2.33.0",
]

[tool.ruff]
src = ["src"]

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging

from langchain.schema import Document
from langchain_classic.schema import Document
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import tool

Expand Down
16 changes: 5 additions & 11 deletions src/intugle/core/llms/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from typing import TYPE_CHECKING, Optional

from langchain.chat_models import init_chat_model
from langchain.output_parsers import (
from langchain_classic.output_parsers import (
ResponseSchema,
RetryWithErrorOutputParser,
StructuredOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate, ChatPromptTemplate
from langchain_core.prompts import BaseChatPromptTemplate, ChatPromptTemplate
from langchain_core.rate_limiters import InMemoryRateLimiter

from intugle.core import settings
Expand Down Expand Up @@ -53,9 +53,7 @@ def __init__(
self.prompt_template: BaseChatPromptTemplate = prompt_template # prompt template

self.output_parser = (
self.__output_parser_builder__(response_schemas=response_schemas)
if response_schemas is not None
else None
self.__output_parser_builder__(response_schemas=response_schemas) if response_schemas is not None else None
) # the built output parser

self.format_instructions = (
Expand All @@ -74,9 +72,7 @@ def __output_parser_builder__(self, response_schemas: list[ResponseSchema] = Non
for building the corresponding output parser from the given ResponseSchema
"""
parser = self.parser.from_response_schemas(response_schemas=response_schemas)
retry_parser = RetryWithErrorOutputParser.from_llm(
parser=parser, llm=self.model, max_retries=self.MAX_RETRIES
)
retry_parser = RetryWithErrorOutputParser.from_llm(parser=parser, llm=self.model, max_retries=self.MAX_RETRIES)
return retry_parser

@classmethod
Expand All @@ -97,9 +93,7 @@ def invoke(self, *args, **kwargs):

sucessfull_parsing = False

prompt_value = self.llm_prompt.format_prompt(
format_instructions=self.format_instructions, **kwargs
)
prompt_value = self.llm_prompt.format_prompt(format_instructions=self.format_instructions, **kwargs)
messages = prompt_value.to_messages()
_message = messages
response = ""
Expand Down
37 changes: 21 additions & 16 deletions src/intugle/core/pipeline/business_glossary/prompts.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
from langchain.output_parsers import ResponseSchema
from langchain_classic.output_parsers import ResponseSchema

table_glossary = [ResponseSchema(name="table_glossary", description=" single-sentence business glossary definition")]
column_glossary = [ResponseSchema(name="column_glossary", description="precise, single-sentence and non-technical business glossary definition")]
column_tag_glossary = [ResponseSchema(name="column_tag_glossary", description="three precise and distinct business tags", type="list[str]")]
column_glossary = [
ResponseSchema(
name="column_glossary", description="precise, single-sentence and non-technical business glossary definition"
)
]
column_tag_glossary = [
ResponseSchema(name="column_tag_glossary", description="three precise and distinct business tags", type="list[str]")
]

BUSINESS_GLOSSARY_PROMPTS = {
"gpt-4o": {
"TABLE_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
"gpt-4o": {
"TABLE_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
generate a concise, non-technical business glossary definition for the table on a provided DDL statement.
The definition should be written as a single sentence and clearly describe the business purpose or function.\n
# Instructions
Expand All @@ -23,7 +29,7 @@
# Output
{format_instructions}
""",
"BUSINESS_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
"BUSINESS_GLOSSARY_TEMPLATE": """You are responsible for Data Governance in {domain},
generate a concise single-sentence business glossary definition for each column mentioned in the DDL statement.\n
The definition should clearly describe the business purpose or function.\n

Expand All @@ -38,7 +44,7 @@
{create_statements}\n
{format_instructions}
""",
"BUSINESS_TAGS_TEMPLATE": """You are responsible for Data Governance in {domain}, your task is to generate three business tags for a column based on the DDL statements of a table given below.
"BUSINESS_TAGS_TEMPLATE": """You are responsible for Data Governance in {domain}, your task is to generate three business tags for a column based on the DDL statements of a table given below.
Use the column's context within the DDL statement (e.g., its name, type, and table name) to infer relevant business tags. Focus on generating concise, domain-relevant,
and meaningful tags that align with the potential business use of the column.

Expand All @@ -65,10 +71,10 @@
# Additional Context:
{additional_context}\n
{format_instructions}
"""
},
"gpt-4o-mini": {
"TABLE_GLOSSARY_TEMPLATE": """
""",
},
"gpt-4o-mini": {
"TABLE_GLOSSARY_TEMPLATE": """
Role: You are responsible for Data Governance in the {domain}.\n
Task: You will be given a SQL DDL statement how `{table}` table is structured. Generate a concise, non-technical business glossary definition for `{table}` that clearly describe the business purpose or function.\n

Expand All @@ -89,7 +95,7 @@
\n\n
{format_instructions}
""",
"BUSINESS_GLOSSARY_TEMPLATE": """
"BUSINESS_GLOSSARY_TEMPLATE": """
Role: You are responsible for Data Governance in the {domain}.\n
Task: You will be given a SQL DDL statement how the attribute `{column}` is structured.\n

Expand All @@ -111,8 +117,7 @@
{additional_context}\n\n
{format_instructions}
""",
"BUSINESS_TAGS_TEMPLATE":
"""
"BUSINESS_TAGS_TEMPLATE": """
Role: You are responsible for Data Governance in the {domain}.\n
Task: You will be given a SQL DDL statement how the attribute `{column}` is structured.\n

Expand All @@ -131,6 +136,6 @@
# Additional Context:
{additional_context}\n\n
{format_instructions}
"""
""",
},
}
}
7 changes: 2 additions & 5 deletions src/intugle/core/pipeline/business_glossary/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pandas as pd

from langchain.output_parsers import RetryOutputParser
from langchain_classic.output_parsers import RetryOutputParser
from langchain_core.prompt_values import StringPromptValue

from intugle.core import settings
Expand Down Expand Up @@ -56,11 +56,8 @@ def get_additional_context(table_name: str, global_additional_context: str = "",


def preprocess_profiling_df(profiling_data: pd.DataFrame):

profiling_data = preprocess_profiling_data(
profiling_data=profiling_data,
sample_limit=settings.STRATA_SAMPLE_LIMIT,
dtypes_to_filter=None
profiling_data=profiling_data, sample_limit=settings.STRATA_SAMPLE_LIMIT, dtypes_to_filter=None
)

return profiling_data
7 changes: 2 additions & 5 deletions src/intugle/core/pipeline/datatype_identification/l2_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import pandas as pd

from langchain.output_parsers import ResponseSchema
from langchain_classic.output_parsers import ResponseSchema
from tqdm.auto import tqdm

from intugle.core.llms.chat import ChatModelLLM
Expand Down Expand Up @@ -82,9 +82,7 @@ def __classify_dim_measure__(self, table: str, column_name: str) -> str:
def __call__(self, row) -> str:
column_name = row["column_name"]

sample_data = adjust_sample(
sample_data=row["sample_data"], expected_size=settings.L2_SAMPLE_LIMIT
)
sample_data = adjust_sample(sample_data=row["sample_data"], expected_size=settings.L2_SAMPLE_LIMIT)

table = pd.DataFrame(sample_data, columns=[column_name])

Expand All @@ -101,7 +99,6 @@ def __call__(
self,
l1_pred: pd.DataFrame,
):

l1_pred["predicted_datatype_l2"] = l1_pred.progress_apply(
self.__model,
axis=1,
Expand Down
Loading
Loading