From e40f7ed7410f5be77811857084999d7cd468cf8c Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 28 Sep 2025 15:45:40 +0000 Subject: [PATCH 01/21] Add script to estimate OFF payload sizes Co-authored-by: owner --- local/off_estimator.py | 248 ++++++++++++++++++++++++++++++++ local/off_estimator_sample.json | 18 +++ 2 files changed, 266 insertions(+) create mode 100644 local/off_estimator.py create mode 100644 local/off_estimator_sample.json diff --git a/local/off_estimator.py b/local/off_estimator.py new file mode 100644 index 0000000..60f3d77 --- /dev/null +++ b/local/off_estimator.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 + +import argparse +import gzip +import io +import json +import sys +import time +from typing import Any, Dict, Iterable, List, Optional, Tuple + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Stream OFF JSONL.gz and estimate projected payload sizes.") + source = parser.add_mutually_exclusive_group(required=True) + source.add_argument("--stdin", action="store_true", help="Read gzipped JSONL from stdin") + source.add_argument("--input", type=str, help="Path to gzipped JSONL file") + parser.add_argument("--sample", type=int, default=0, help="Number of records to process for sampling (0 = full stream)") + parser.add_argument("--output", type=str, default="", help="Optional path to write JSON report") + return parser.parse_args() + + +def iter_gz_lines_from_stdin() -> Iterable[str]: + gz = gzip.GzipFile(fileobj=sys.stdin.buffer, mode="rb") + with io.TextIOWrapper(gz, encoding="utf-8", errors="ignore", newline="\n") as f: + for line in f: + yield line + + +def iter_gz_lines_from_file(path: str) -> Iterable[str]: + with gzip.open(path, mode="rt", encoding="utf-8", errors="ignore", newline="\n") as f: + for line in f: + yield line + + +def extract_display_image_urls(selected_images: Optional[Dict[str, Any]]) -> List[Dict[str, str]]: + if not isinstance(selected_images, dict): + return [] + urls: List[Dict[str, str]] = [] + try: + for image in selected_images.values(): + display = image.get("display") if isinstance(image, dict) else None + if isinstance(display, dict): + # Prefer English if available, else any string value + if isinstance(display.get("en"), str): + urls.append({"url": display["en"]}) + else: + for v in display.values(): + if isinstance(v, str): + urls.append({"url": v}) + break + except Exception: + # Be tolerant to odd structures + return urls + return urls + + +def map_ingredient_node(node: Dict[str, Any]) -> Dict[str, Any]: + mapped: Dict[str, Any] = { + "name": node.get("text"), + "vegan": node.get("vegan"), + "vegetarian": node.get("vegetarian"), + "ingredients": [], + } + sub = node.get("ingredients") + if isinstance(sub, list) and len(sub) > 0: + mapped["ingredients"] = [map_ingredient_node(child) for child in sub if isinstance(child, dict)] + return mapped + + +def extract_projection(product: Dict[str, Any]) -> Dict[str, Any]: + # Barcode / code + barcode: Optional[str] = None + code = product.get("code") + if isinstance(code, str) and code.strip(): + barcode = code.strip() + elif isinstance(code, (int, float)): + barcode = str(code) + elif isinstance(product.get("_id"), str): + barcode = product.get("_id") + + # Brand + brand: Optional[str] = None + brand_owner = product.get("brand_owner") + if isinstance(brand_owner, str) and brand_owner.strip(): + brand = brand_owner.strip() + else: + brands = product.get("brands") + if isinstance(brands, str) and brands.strip(): + # OFF brands is comma-separated; 
take the first token + brand = brands.split(",")[0].strip() + + # Name + name: Optional[str] = None + product_name = product.get("product_name") + if isinstance(product_name, str) and product_name.strip(): + name = product_name.strip() + else: + # Try language-specific variants if present + for k, v in product.items(): + if k.startswith("product_name_") and isinstance(v, str) and v.strip(): + name = v.strip() + break + + # Ingredients + ingredients_list: List[Dict[str, Any]] = [] + raw_ingredients = product.get("ingredients") + if isinstance(raw_ingredients, list) and len(raw_ingredients) > 0: + ingredients_list = [map_ingredient_node(node) for node in raw_ingredients if isinstance(node, dict)] + + # Images + images: List[Dict[str, str]] = [] + selected_images = product.get("selected_images") + images = extract_display_image_urls(selected_images) + + return { + "barcode": barcode, + "brand": brand, + "name": name, + "ingredients": ingredients_list, + "images": images, + } + + +def json_bytes(value: Any) -> int: + try: + return len(json.dumps(value, ensure_ascii=False, separators=(",", ":")).encode("utf-8")) + except Exception: + return 0 + + +def utf8_bytes(value: Optional[str]) -> int: + if value is None: + return 0 + try: + return len(value.encode("utf-8")) + except Exception: + return 0 + + +def run(lines: Iterable[str], sample: int = 0) -> Dict[str, Any]: + start = time.time() + + total_records = 0 + projected_records = 0 + barcode_bytes_total = 0 + brand_bytes_total = 0 + name_bytes_total = 0 + ingredients_bytes_total = 0 + images_bytes_total = 0 + + nonempty_brand = 0 + nonempty_name = 0 + nonempty_ingredients = 0 + nonempty_images = 0 + + processed = 0 + for raw in lines: + if sample and processed >= sample: + break + total_records += 1 + raw = raw.strip() + if not raw: + continue + try: + product = json.loads(raw) + except Exception: + continue + + proj = extract_projection(product) + if proj.get("barcode"): + projected_records += 1 + barcode_bytes_total += utf8_bytes(proj.get("barcode")) + + b = proj.get("brand") + if b: + nonempty_brand += 1 + brand_bytes_total += utf8_bytes(b) + + n = proj.get("name") + if n: + nonempty_name += 1 + name_bytes_total += utf8_bytes(n) + + ing = proj.get("ingredients") + if isinstance(ing, list) and len(ing) > 0: + nonempty_ingredients += 1 + ingredients_bytes_total += json_bytes(ing) + + imgs = proj.get("images") + if isinstance(imgs, list) and len(imgs) > 0: + nonempty_images += 1 + images_bytes_total += json_bytes(imgs) + + processed += 1 + + elapsed = time.time() - start + + result = { + "total_records": total_records, + "projected_records_with_barcode": projected_records, + "barcode_bytes_total": barcode_bytes_total, + "brand_bytes_total": brand_bytes_total, + "name_bytes_total": name_bytes_total, + "ingredients_bytes_total": ingredients_bytes_total, + "images_bytes_total": images_bytes_total, + "nonempty_counts": { + "brand": nonempty_brand, + "name": nonempty_name, + "ingredients": nonempty_ingredients, + "images": nonempty_images, + }, + "elapsed_seconds": elapsed, + } + + totals_payload = ( + barcode_bytes_total + + brand_bytes_total + + name_bytes_total + + ingredients_bytes_total + + images_bytes_total + ) + result["projected_payload_bytes_total"] = totals_payload + result["avg_payload_bytes_per_projected_row"] = ( + (totals_payload / projected_records) if projected_records else 0.0 + ) + + return result + + +def main() -> None: + args = parse_args() + if args.stdin: + lines = iter_gz_lines_from_stdin() + else: + lines = 
iter_gz_lines_from_file(args.input) + + result = run(lines, sample=args.sample) + + if args.output: + with open(args.output, "w", encoding="utf-8") as f: + json.dump(result, f, ensure_ascii=False, indent=2) + else: + print(json.dumps(result, ensure_ascii=False, indent=2)) + + +if __name__ == "__main__": + main() + diff --git a/local/off_estimator_sample.json b/local/off_estimator_sample.json new file mode 100644 index 0000000..5e3bf48 --- /dev/null +++ b/local/off_estimator_sample.json @@ -0,0 +1,18 @@ +{ + "total_records": 100000, + "projected_records_with_barcode": 100000, + "barcode_bytes_total": 1288365, + "brand_bytes_total": 1796855, + "name_bytes_total": 2689661, + "ingredients_bytes_total": 108627317, + "images_bytes_total": 0, + "nonempty_counts": { + "brand": 98819, + "name": 99632, + "ingredients": 96406, + "images": 0 + }, + "elapsed_seconds": 53.94886898994446, + "projected_payload_bytes_total": 114402198, + "avg_payload_bytes_per_projected_row": 1144.02198 +} From a64fa1d1b26912f66f76f4f98f325b478834d5ba Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sun, 28 Sep 2025 16:26:07 +0000 Subject: [PATCH 02/21] Add off_jsonl_linecount.txt with line count Co-authored-by: owner --- local/off_jsonl_linecount.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 local/off_jsonl_linecount.txt diff --git a/local/off_jsonl_linecount.txt b/local/off_jsonl_linecount.txt new file mode 100644 index 0000000..73383a4 --- /dev/null +++ b/local/off_jsonl_linecount.txt @@ -0,0 +1 @@ +4046118 From e213cf0e9240652d87426df5712e31b6b634dc3d Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 13:53:20 +0530 Subject: [PATCH 03/21] Add SSE inventory+analysis stream and shared helpers (#4) --- supabase/functions/ingredicheck/analyzer.ts | 332 +++++++++++++----- supabase/functions/ingredicheck/index.ts | 1 + supabase/functions/ingredicheck/inventory.ts | 276 +++++++++------ supabase/functions/shared/db.ts | 4 + .../shared/llm/ingredientanalyzeragent.ts | 2 +- 5 files changed, 418 insertions(+), 197 deletions(-) diff --git a/supabase/functions/ingredicheck/analyzer.ts b/supabase/functions/ingredicheck/analyzer.ts index 6cda7b1..cddb36e 100644 --- a/supabase/functions/ingredicheck/analyzer.ts +++ b/supabase/functions/ingredicheck/analyzer.ts @@ -1,93 +1,257 @@ +import { Context } from "https://deno.land/x/oak@v12.6.0/mod.ts"; +import * as DB from "../shared/db.ts"; +import { + ingredientAnalyzerAgent, + IngredientRecommendation, +} from "../shared/llm/ingredientanalyzeragent.ts"; +import * as Inventory from "./inventory.ts"; -import { Context } from 'https://deno.land/x/oak@v12.6.0/mod.ts' -import * as DB from '../shared/db.ts' -import { ingredientAnalyzerAgent } from '../shared/llm/ingredientanalyzeragent.ts' +const MB = 1024 * 1024; -const MB = 1024 * 1024 +export type AnalysisRequest = { + barcode?: string; + userPreferenceText?: string; + clientActivityId?: string; +}; export async function analyze(ctx: Context) { + const startTime = new Date(); + let requestBody: AnalysisRequest = {}; + let responseBody: unknown = []; + let responseStatus = 200; - const startTime = new Date() - let requestBody: any = {} - let product = DB.defaultProduct() - - try { - const body = ctx.request.body({ type: "form-data" }) - const formData = await body.value.read({ maxSize: 10 * MB }) - - requestBody = { - barcode: formData.fields['barcode'], - userPreferenceText: formData.fields['userPreferenceText'], - clientActivityId: formData.fields['clientActivityId'] - } - - ctx.state.clientActivityId = 
requestBody.clientActivityId - - if (requestBody.barcode !== undefined) { - const result = await ctx.state.supabaseClient - .from('log_inventory') - .select() - .eq('barcode', requestBody.barcode) - .order('created_at', { ascending: false }) - .limit(1) - .single() - - if (result.error) { - throw result.error - } - - product = result.data as DB.Product - } else { - const result = await ctx.state.supabaseClient - .from('log_extract') - .select() - .eq('client_activity_id', ctx.state.clientActivityId) - .order('created_at', { ascending: false }) - .limit(1) - .single() - - if (result.error) { - throw result.error - } - - product = { - barcode: result.data.barcode, - brand: result.data.brand, - name: result.data.name, - ingredients: result.data.ingredients ?? [], - images: [] - } - } - - // Skip analyzer agent if user has no preferences set - const hasValidPreferences = requestBody.userPreferenceText && - requestBody.userPreferenceText.trim() !== "" && - requestBody.userPreferenceText.trim().toLowerCase() !== "none" - - const ingredientRecommendations = - product.ingredients && product.ingredients.length !== 0 && hasValidPreferences - ? await ingredientAnalyzerAgent(ctx, product, requestBody.userPreferenceText) - : [] - - ctx.response.status = 200 - ctx.response.body = ingredientRecommendations - } catch (error) { - ctx.response.status = 500 - ctx.response.body = error - } + try { + const body = ctx.request.body({ type: "form-data" }); + const formData = await body.value.read({ maxSize: 10 * MB }); + + requestBody = { + barcode: formData.fields["barcode"], + userPreferenceText: formData.fields["userPreferenceText"], + clientActivityId: formData.fields["clientActivityId"], + }; + + const result = await performAnalysis({ + ctx, + requestBody, + }); + + responseStatus = 200; + responseBody = result.recommendations; + } catch (error) { + responseStatus = 500; + responseBody = error; + } + + ctx.response.status = responseStatus; + ctx.response.body = responseBody; + + await logAnalysisResult( + ctx, + startTime, + requestBody, + responseStatus, + responseBody, + ); +} + +export async function streamInventoryAndAnalysis(ctx: Context) { + const barcode = ctx.params.barcode; + const clientActivityId = + ctx.request.url.searchParams.get("clientActivityId") ?? undefined; + const userPreferenceText = + ctx.request.url.searchParams.get("userPreferenceText") ?? undefined; + + const sse = ctx.sendEvents(); + + if (!barcode) { + sse.dispatchMessage({ + event: "error", + data: JSON.stringify({ message: "Barcode is required." }), + }); + sse.close(); + return; + } + + const inventoryResult = await Inventory.fetchProduct({ + supabaseClient: ctx.state.supabaseClient, + barcode, + clientActivityId, + }); + + if (inventoryResult.status !== 200 || !inventoryResult.product) { + const errorPayload = { + message: inventoryResult.error ?? 
"Product not found.", + status: inventoryResult.status, + }; + sse.dispatchMessage({ + event: "error", + data: JSON.stringify(errorPayload), + }); + sse.close(); + return; + } + + sse.dispatchMessage({ + event: "product", + data: JSON.stringify(inventoryResult.product), + }); + + const analysisStartTime = new Date(); + + const analysisRequest: AnalysisRequest = { + barcode, + userPreferenceText, + clientActivityId, + }; + + try { + const analysisResult = await performAnalysis({ + ctx, + requestBody: analysisRequest, + productOverride: inventoryResult.product, + }); + + sse.dispatchMessage({ + event: "analysis", + data: JSON.stringify(analysisResult.recommendations), + }); + + await logAnalysisResult( + ctx, + analysisStartTime, + analysisRequest, + 200, + analysisResult.recommendations, + ); + } catch (error) { + const message = error instanceof Error ? error.message : "Analysis failed."; + + sse.dispatchMessage({ + event: "error", + data: JSON.stringify({ message }), + }); - const endTime = new Date() + await logAnalysisResult( + ctx, + analysisStartTime, + analysisRequest, + 500, + { message }, + ); + } finally { + sse.dispatchComment("done"); + sse.close(); + } +} - ctx.state.supabaseClient.functions.invoke('background/log_analyzebarcode', { +type PerformAnalysisOptions = { + ctx: Context; + requestBody: AnalysisRequest; + productOverride?: DB.Product; +}; + +type PerformAnalysisResult = { + product: DB.Product; + recommendations: IngredientRecommendation[]; +}; + +export async function performAnalysis( + options: PerformAnalysisOptions, +): Promise { + const { ctx, requestBody, productOverride } = options; + + ctx.state.clientActivityId = requestBody.clientActivityId; + + const product = productOverride ?? await lookupProduct(ctx, requestBody); + + const hasValidPreferences = requestBody.userPreferenceText && + requestBody.userPreferenceText.trim() !== "" && + requestBody.userPreferenceText.trim().toLowerCase() !== "none"; + + const hasIngredients = Array.isArray(product.ingredients) && + product.ingredients.length > 0; + + const recommendations = hasValidPreferences && hasIngredients + ? 
await ingredientAnalyzerAgent( + ctx, + product, + requestBody.userPreferenceText!, + ) + : []; + + return { + product, + recommendations, + }; +} + +export async function logAnalysisResult( + ctx: Context, + startTime: Date, + requestBody: AnalysisRequest, + responseStatus: number, + responseBody: unknown, +) { + const endTime = new Date(); + + try { + await ctx.state.supabaseClient.functions.invoke( + "background/log_analyzebarcode", + { body: { - activity_id: ctx.state.activityId, - client_activity_id: ctx.state.clientActivityId, - start_time: startTime, - end_time: endTime, - request_body: requestBody, - response_status: ctx.response.status, - response_body: ctx.response.body + activity_id: ctx.state.activityId, + client_activity_id: ctx.state.clientActivityId, + start_time: startTime, + end_time: endTime, + request_body: requestBody, + response_status: responseStatus, + response_body: responseBody, }, - method: 'POST' - }) -} \ No newline at end of file + method: "POST", + }, + ); + } catch (error) { + console.error("Failed to log analyze barcode event", error); + } +} + +async function lookupProduct( + ctx: Context, + requestBody: AnalysisRequest, +): Promise { + if (requestBody.barcode !== undefined) { + const result = await ctx.state.supabaseClient + .from("log_inventory") + .select() + .eq("barcode", requestBody.barcode) + .order("created_at", { ascending: false }) + .limit(1) + .single(); + + if (result.error) { + throw result.error; + } + + return result.data as DB.Product; + } + + const result = await ctx.state.supabaseClient + .from("log_extract") + .select() + .eq("client_activity_id", ctx.state.clientActivityId) + .order("created_at", { ascending: false }) + .limit(1) + .single(); + + if (result.error) { + throw result.error; + } + + return { + barcode: result.data.barcode, + brand: result.data.brand, + name: result.data.name, + ingredients: result.data.ingredients ?? 
[], + images: [], + }; +} diff --git a/supabase/functions/ingredicheck/index.ts b/supabase/functions/ingredicheck/index.ts index c606bec..d4d0ccc 100644 --- a/supabase/functions/ingredicheck/index.ts +++ b/supabase/functions/ingredicheck/index.ts @@ -46,6 +46,7 @@ router } ctx.response.status = 204 }) + .get('/ingredicheck/inventory/:barcode/analyze-stream', Analyzer.streamInventoryAndAnalysis) .get('/ingredicheck/inventory/:barcode', async (ctx) => { const clientActivityId = ctx.request.url.searchParams.get("clientActivityId") await Inventory.get(ctx, ctx.params.barcode, clientActivityId) diff --git a/supabase/functions/ingredicheck/inventory.ts b/supabase/functions/ingredicheck/inventory.ts index 7f0bffa..5c188a5 100644 --- a/supabase/functions/ingredicheck/inventory.ts +++ b/supabase/functions/ingredicheck/inventory.ts @@ -1,124 +1,176 @@ -import { Context } from 'https://deno.land/x/oak@v12.6.0/mod.ts' -import * as DB from '../shared/db.ts' - -export async function get(ctx: Context, barcode: string, clientActivityId: string | null) { - - let result_json: any = {} - let log_json: any = { - start_time: new Date(), - barcode: barcode, - data_source: 'openfoodfacts/v3', - client_activity_id: clientActivityId, - } - - const url = `https://world.openfoodfacts.org/api/v3/product/${barcode}.json` - const response = await fetch(url) - const data = await response.json() - - if (data.status === 'failure') { - console.log(`Unexpected product details: ${JSON.stringify(data, null, 2)}`) - ctx.response.status = 404 +import { Context } from "https://deno.land/x/oak@v12.6.0/mod.ts"; +import * as DB from "../shared/db.ts"; + +type InventoryFetchOptions = { + supabaseClient: any; + barcode: string; + clientActivityId?: string | null; +}; + +type InventoryFetchResult = { + status: number; + product: DB.Product | null; + error?: string; +}; + +export async function fetchProduct( + options: InventoryFetchOptions, +): Promise { + const { supabaseClient, barcode, clientActivityId } = options; + + let product: DB.Product | null = null; + let errorMessage: string | undefined; + + const log_json: Record = { + start_time: new Date(), + barcode: barcode, + data_source: "openfoodfacts/v3", + client_activity_id: clientActivityId, + }; + + let status = 200; + + try { + const url = + `https://world.openfoodfacts.org/api/v3/product/${barcode}.json`; + const response = await fetch(url); + const data = await response.json(); + + if (data.status === "failure") { + console.log( + `Unexpected product details: ${JSON.stringify(data, null, 2)}`, + ); + status = 404; + errorMessage = data.status_verbose || "Product not found."; } else { - // console.log(`brand: ${data.product.brand_owner}`) - // console.log(`name: ${data.product.product_name}`) - // console.log(`ingredients: ${data.product.ingredients}`) - // console.log(`images: ${data.product.selected_images?.front?.display?.en}`) - result_json = processOpenFoodFactsProductData(barcode, data.product) - log_json = { - ...log_json, - ...result_json - } - ctx.response.status = 200 + product = processOpenFoodFactsProductData(barcode, data.product); + Object.assign(log_json, product); } + } catch (error) { + status = 500; + errorMessage = (error as Error).message; + console.error(`Failed to fetch product ${barcode}: ${errorMessage}`); + } + + log_json.end_time = new Date(); + log_json.response_status = status; + if (errorMessage) { + log_json.error = errorMessage; + } + + await supabaseClient.functions.invoke("background/log_inventory", { + body: log_json, + method: "POST", + }); 
+ + return { + status, + product, + error: errorMessage, + }; +} - log_json.end_time = new Date() - - await ctx.state.supabaseClient.functions.invoke('background/log_inventory', { - body: log_json, - method: 'POST' - }) - - ctx.response.body = result_json +export async function get( + ctx: Context, + barcode: string, + clientActivityId: string | null, +) { + const result = await fetchProduct({ + supabaseClient: ctx.state.supabaseClient, + barcode, + clientActivityId, + }); + + ctx.response.status = result.status; + if (result.status === 200 && result.product) { + ctx.response.body = result.product; + } else { + ctx.response.body = { + error: result.error ?? "Unexpected inventory error.", + }; + } } type SelectedImages = { - [key: string]: { - display: { - [key: string]: string - } - } -} + [key: string]: { + display: { + [key: string]: string; + }; + }; +}; type ImageUrl = { - url: string -} + url: string; +}; function extractDisplayImageUrls(selectedImages?: SelectedImages): ImageUrl[] { - if (selectedImages) { - return Object.values(selectedImages).flatMap(image => { - if (image.display?.en) { - return [{ - url: image.display.en - }] - } - return [] - }) - } - return [] + if (selectedImages) { + return Object.values(selectedImages).flatMap((image) => { + if (image.display?.en) { + return [{ + url: image.display.en, + }]; + } + return []; + }); + } + return []; } -function processOpenFoodFactsProductData(barcode: string, product: any) : DB.Product { - - let brand: string | undefined = undefined - let name: string | undefined = undefined - let ingredients: any[] = [] - - if (product.brand_owner) { - brand = product.brand_owner - } - - if (product.product_name) { - name = product.product_name - } - - if (product.ingredients) { - ingredients = - product.ingredients.map((i: any) => { - return { - name: i.text, - vegan: i.vegan, - vegetarian: i.vegetarian, - ingredients: i.ingredients?.map((i2: any) => { - return { - name: i2.text, - vegan: i2.vegan, - vegetarian: i2.vegetarian, - ingredients: i2.ingredients?.map((i3: any) => { - return { - name: i3.text, - vegan: i3.vegan, - vegetarian: i3.vegetarian, - ingredients: [] - } - }) ?? [] - } - }) ?? [] - } - }) - } - - const images = extractDisplayImageUrls(product.selected_images) - - // Workaround for known issues with OpenFoodFacts data - if (barcode === '0096619362776') { - // Label says 'Contains No Animal Rennet', but ingredient list has 'Animal Rennet'. - ingredients = ingredients.filter((i) => i.name !== 'Animal Rennet') - } - - return { - brand: brand, - name: name, - ingredients: ingredients, - images: images - } -} \ No newline at end of file +function processOpenFoodFactsProductData( + barcode: string, + product: any, +): DB.Product { + let brand: string | undefined = undefined; + let name: string | undefined = undefined; + let ingredients: any[] = []; + + if (product.brand_owner) { + brand = product.brand_owner; + } + + if (product.product_name) { + name = product.product_name; + } + + if (product.ingredients) { + ingredients = product.ingredients.map((i: any) => { + return { + name: i.text, + vegan: i.vegan, + vegetarian: i.vegetarian, + ingredients: i.ingredients?.map((i2: any) => { + return { + name: i2.text, + vegan: i2.vegan, + vegetarian: i2.vegetarian, + ingredients: i2.ingredients?.map((i3: any) => { + return { + name: i3.text, + vegan: i3.vegan, + vegetarian: i3.vegetarian, + ingredients: [], + }; + }) ?? [], + }; + }) ?? 
[], + }; + }); + } + + const images = extractDisplayImageUrls(product.selected_images); + + // Workaround for known issues with OpenFoodFacts data + if (barcode === "0096619362776") { + // Label says 'Contains No Animal Rennet', but ingredient list has 'Animal Rennet'. + ingredients = ingredients.filter((i) => i.name !== "Animal Rennet"); + } + + return { + barcode: barcode, + brand: brand, + name: name, + ingredients: ingredients, + images: images, + }; +} diff --git a/supabase/functions/shared/db.ts b/supabase/functions/shared/db.ts index 878af75..c8908e8 100644 --- a/supabase/functions/shared/db.ts +++ b/supabase/functions/shared/db.ts @@ -22,6 +22,10 @@ export type Product = { export function defaultProduct(): Product { return { + barcode: undefined, + data_source: undefined, + brand: undefined, + name: undefined, ingredients: [], images: [], } diff --git a/supabase/functions/shared/llm/ingredientanalyzeragent.ts b/supabase/functions/shared/llm/ingredientanalyzeragent.ts index 6c79db5..e2a8b86 100644 --- a/supabase/functions/shared/llm/ingredientanalyzeragent.ts +++ b/supabase/functions/shared/llm/ingredientanalyzeragent.ts @@ -7,7 +7,7 @@ import { import { createGeminiProgram } from "./programs.ts"; import { ChatMessage } from "./types.ts"; -type IngredientRecommendation = { +export type IngredientRecommendation = { ingredientName: string; safetyRecommendation: "MaybeUnsafe" | "DefinitelyUnsafe"; reasoning: string; From 10e72c27a2d474eb330c8f6af79824d66d820495 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 15:46:59 +0530 Subject: [PATCH 04/21] feat(inventory): add Deno OFF ingest script and inventory_cache table; remove estimation artifacts --- supabase/database/tables.sql | 44 ++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/supabase/database/tables.sql b/supabase/database/tables.sql index 9d816f6..1b01d59 100644 --- a/supabase/database/tables.sql +++ b/supabase/database/tables.sql @@ -1,4 +1,48 @@ +<<<<<<< Current (Your changes) +======= +-------------------------------------------------------------------------------- + +create table + public.inventory_cache ( + created_at timestamp with time zone not null default now(), + updated_at timestamp with time zone not null default now(), + last_refreshed_at timestamp with time zone, + barcode text not null, + data_source text not null default 'openfoodfacts/v3', + name text, + brand text, + ingredients jsonb not null default '[]'::jsonb, + images jsonb not null default '[]'::jsonb, + off_last_modified_t bigint, + etag text, + constraint inventory_cache_pkey primary key (barcode) + ) tablespace pg_default; + +alter table public.inventory_cache enable row level security; + +create policy "Select for all authenticated users" on public.inventory_cache + for select + using (true); + +create policy "Write for service role only" on public.inventory_cache + for ALL + using (auth.role() = 'service_role') + with check (auth.role() = 'service_role'); + +create or replace function set_inventory_cache_updated_at() +returns trigger as $$ +begin + new.updated_at = now(); + return new; +end; +$$ language plpgsql; + +create trigger trg_inventory_cache_updated_at +before update on public.inventory_cache +for each row execute function set_inventory_cache_updated_at(); + +>>>>>>> Incoming (Background Agent changes) -------------------------------------------------------------------------------- create table From 62805fd388e5115c689193dabe837d84b27ad8be Mon Sep 17 00:00:00 2001 From: justanotheratom 
Date: Tue, 30 Sep 2025 16:04:06 +0530 Subject: [PATCH 05/21] feat: add Deno OFF ingest script for inventory caching --- local/off_ingest.ts | 292 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 292 insertions(+) create mode 100644 local/off_ingest.ts diff --git a/local/off_ingest.ts b/local/off_ingest.ts new file mode 100644 index 0000000..d1f072a --- /dev/null +++ b/local/off_ingest.ts @@ -0,0 +1,292 @@ +// deno run -A --import-map=local/import_map.json local/off_ingest.ts +// Environment: SUPABASE_URL, SUPABASE_SECRET_KEY must be set for upload +// Note: SUPABASE_SECRET_KEY is the new key type (replaces SUPABASE_SERVICE_ROLE_KEY) + +import { createClient } from "@supabase/supabase-js"; + +type Ingredient = { + name: string; + vegan?: boolean; + vegetarian?: boolean; + ingredients?: Ingredient[]; +}; + +type Image = { url: string }; + +type CacheRow = { + barcode: string; + data_source: string; + brand?: string; + name?: string; + ingredients: Ingredient[]; + images: Image[]; + off_last_modified_t?: number; +}; + +const OFF_JSONL_GZ_URL = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"; +const OUTPUT_PATH = "local/off_inventory_cache.jsonl"; +const BATCH_UPLOAD_SIZE = 500; + +function mapIngredient(node: any): Ingredient { + const item: Ingredient = { + name: typeof node?.text === "string" ? node.text : undefined as unknown as string, + vegan: node?.vegan, + vegetarian: node?.vegetarian, + ingredients: [], + }; + if (Array.isArray(node?.ingredients) && node.ingredients.length > 0) { + item.ingredients = node.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); + } + return item; +} + +function extractDisplayImageUrls(selectedImages: any): Image[] { + if (!selectedImages || typeof selectedImages !== "object") return []; + const urls: Image[] = []; + try { + for (const value of Object.values(selectedImages as Record)) { + const display = (value as any)?.display; + if (display && typeof display === "object") { + if (typeof display.en === "string" && display.en) { + urls.push({ url: display.en }); + } else { + for (const v of Object.values(display)) { + if (typeof v === "string" && v) { + urls.push({ url: v }); + break; + } + } + } + } + } + } catch (_) { + // ignore malformed structures + } + return urls; +} + +function mapToCacheRow(product: any): CacheRow | null { + const dataSource = "openfoodfacts/v3"; + + let barcode: string | undefined; + const code = product?.code; + if (typeof code === "string" && code.trim()) { + barcode = code.trim(); + } else if (typeof code === "number") { + barcode = String(code); + } else if (typeof product?._id === "string" && product._id.trim()) { + barcode = product._id.trim(); + } + if (!barcode) return null; + + let brand: string | undefined; + if (typeof product?.brand_owner === "string" && product.brand_owner.trim()) { + brand = product.brand_owner.trim(); + } else if (typeof product?.brands === "string" && product.brands.trim()) { + brand = product.brands.split(",")[0]?.trim(); + } + + let name: string | undefined; + if (typeof product?.product_name === "string" && product.product_name.trim()) { + name = product.product_name.trim(); + } else { + for (const [k, v] of Object.entries(product ?? 
{})) { + if (k.startsWith("product_name_") && typeof v === "string" && (v as string).trim()) { + name = (v as string).trim(); + break; + } + } + } + + let ingredients: Ingredient[] = []; + if (Array.isArray(product?.ingredients) && product.ingredients.length > 0) { + ingredients = product.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); + } + + const images = extractDisplayImageUrls(product?.selected_images); + const off_last_modified_t = typeof product?.last_modified_t === "number" ? product.last_modified_t : undefined; + + return { + barcode, + data_source: dataSource, + brand, + name, + ingredients, + images, + off_last_modified_t, + }; +} + +async function* iterLinesFromGzip(url: string): AsyncGenerator { + const res = await fetch(url); + if (!res.body) throw new Error("No response body from OFF"); + const decompressed = res.body.pipeThrough(new DecompressionStream("gzip")); + const textStream = decompressed.pipeThrough(new TextDecoderStream()); + const reader = textStream.getReader(); + let buf = ""; + try { + while (true) { + const { value, done } = await reader.read(); + if (done) break; + buf += value ?? ""; + let idx: number; + while ((idx = buf.indexOf("\n")) !== -1) { + const line = buf.slice(0, idx); + buf = buf.slice(idx + 1); + yield line; + } + } + if (buf.length > 0) { + yield buf; + } + } finally { + reader.releaseLock(); + } +} + +async function writeJsonl(rows: AsyncIterable, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number } }> { + const file = await Deno.open(outPath, { create: true, write: true, truncate: true }); + const encoder = new TextEncoder(); + let count = 0; + let totalBytes = 0; + let nonBrand = 0, nonName = 0, nonIng = 0, nonImg = 0; + try { + for await (const row of rows) { + count++; + if (row.brand) nonBrand++; + if (row.name) nonName++; + if (row.ingredients && row.ingredients.length) nonIng++; + if (row.images && row.images.length) nonImg++; + const json = JSON.stringify(row); + totalBytes += encoder.encode(json).byteLength; + await file.write(encoder.encode(json + "\n")); + } + } finally { + file.close(); + } + return { count, totalBytes, nonEmpty: { brand: nonBrand, name: nonName, ingredients: nonIng, images: nonImg } }; +} + +async function* projectRows(lines: AsyncIterable): AsyncGenerator { + for await (const line of lines) { + const trimmed = line.trim(); + if (!trimmed) continue; + try { + const product = JSON.parse(trimmed); + const row = mapToCacheRow(product); + if (row && row.barcode) { + yield row; + } + } catch (_) { + // skip invalid lines + } + } +} + +function formatBytes(bytes: number): string { + const units = ["B", "KB", "MB", "GB", "TB"] as const; + let i = 0; + let n = bytes; + while (n >= 1024 && i < units.length - 1) { + n /= 1024; + i++; + } + return `${n.toFixed(2)} ${units[i]}`; +} + +async function askUploadPermission(stats: { count: number; totalBytes: number }): Promise { + console.log("\nSummary:"); + console.log(` Rows: ${stats.count}`); + console.log(` Payload size (JSON only): ${formatBytes(stats.totalBytes)} (~${Math.round(stats.totalBytes / Math.max(1, stats.count))} B/row)`); + const answer = confirm("Upload to Supabase inventory_cache? This can take a long time. (y/N)"); + return !!answer; +} + +async function uploadJsonlToSupabase(path: string) { + const url = Deno.env.get("SUPABASE_URL") ?? ""; + const key = Deno.env.get("SUPABASE_SECRET_KEY") ?? Deno.env.get("SUPABASE_SERVICE_ROLE_KEY") ?? 
""; + if (!url || !key) throw new Error("SUPABASE_URL and SUPABASE_SECRET_KEY (or SUPABASE_SERVICE_ROLE_KEY for legacy) must be set in environment"); + const supabase = createClient(url, key, { auth: { persistSession: false } }); + + const file = await Deno.open(path, { read: true }); + const decoder = new TextDecoder(); + const bufSize = 1024 * 1024; + const buf = new Uint8Array(bufSize); + let pending: any[] = []; + let leftover = ""; + let total = 0; + try { + while (true) { + const read = await file.read(buf); + if (read === null) break; + const chunk = decoder.decode(buf.subarray(0, read)); + let data = leftover + chunk; + let idx: number; + while ((idx = data.indexOf("\n")) !== -1) { + const line = data.slice(0, idx); + data = data.slice(idx + 1); + if (!line) continue; + try { + const row = JSON.parse(line); + // Set last_refreshed_at during upsert + row.last_refreshed_at = new Date().toISOString(); + pending.push(row); + total++; + if (pending.length >= BATCH_UPLOAD_SIZE) { + const { error } = await supabase.from("inventory_cache").upsert(pending, { onConflict: "barcode" }); + if (error) throw error; + pending = []; + } + } catch (_) { + // skip + } + } + leftover = data; + } + if (leftover.trim()) { + try { + const row = JSON.parse(leftover.trim()); + row.last_refreshed_at = new Date().toISOString(); + pending.push(row); + total++; + } catch (_) {} + } + if (pending.length) { + const { error } = await supabase.from("inventory_cache").upsert(pending, { onConflict: "barcode" }); + if (error) throw error; + } + } finally { + file.close(); + } + console.log(`Uploaded/Upserted ${total} rows to inventory_cache`); +} + +async function main() { + console.log("Downloading OFF JSONL.gz and streaming transform..."); + const rows = projectRows(iterLinesFromGzip(OFF_JSONL_GZ_URL)); + const stats = await writeJsonl(rows, OUTPUT_PATH); + console.log("\nField coverage (non-empty counts):"); + console.log(` brand: ${stats.nonEmpty.brand}`); + console.log(` name: ${stats.nonEmpty.name}`); + console.log(` ingredients: ${stats.nonEmpty.ingredients}`); + console.log(` images: ${stats.nonEmpty.images}`); + const proceed = await askUploadPermission(stats); + if (!proceed) { + console.log("Upload skipped."); + return; + } + console.log("\nUploading to Supabase (batched upserts)..."); + const start = Date.now(); + await uploadJsonlToSupabase(OUTPUT_PATH); + const elapsed = (Date.now() - start) / 1000; + console.log(`Done in ${elapsed.toFixed(1)}s.`); +} + +if (import.meta.main) { + await main().catch((err) => { + console.error("Error:", err); + Deno.exit(1); + }); +} + + From 028cbb144e2ae55381410edc00904d734251e89d Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 16:06:22 +0530 Subject: [PATCH 06/21] chore: remove extraneous estimator files from PR --- local/off_estimator.py | 248 -------------------------------- local/off_estimator_sample.json | 18 --- local/off_jsonl_linecount.txt | 1 - 3 files changed, 267 deletions(-) delete mode 100644 local/off_estimator.py delete mode 100644 local/off_estimator_sample.json delete mode 100644 local/off_jsonl_linecount.txt diff --git a/local/off_estimator.py b/local/off_estimator.py deleted file mode 100644 index 60f3d77..0000000 --- a/local/off_estimator.py +++ /dev/null @@ -1,248 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -import gzip -import io -import json -import sys -import time -from typing import Any, Dict, Iterable, List, Optional, Tuple - - -def parse_args() -> argparse.Namespace: - parser = 
argparse.ArgumentParser(description="Stream OFF JSONL.gz and estimate projected payload sizes.") - source = parser.add_mutually_exclusive_group(required=True) - source.add_argument("--stdin", action="store_true", help="Read gzipped JSONL from stdin") - source.add_argument("--input", type=str, help="Path to gzipped JSONL file") - parser.add_argument("--sample", type=int, default=0, help="Number of records to process for sampling (0 = full stream)") - parser.add_argument("--output", type=str, default="", help="Optional path to write JSON report") - return parser.parse_args() - - -def iter_gz_lines_from_stdin() -> Iterable[str]: - gz = gzip.GzipFile(fileobj=sys.stdin.buffer, mode="rb") - with io.TextIOWrapper(gz, encoding="utf-8", errors="ignore", newline="\n") as f: - for line in f: - yield line - - -def iter_gz_lines_from_file(path: str) -> Iterable[str]: - with gzip.open(path, mode="rt", encoding="utf-8", errors="ignore", newline="\n") as f: - for line in f: - yield line - - -def extract_display_image_urls(selected_images: Optional[Dict[str, Any]]) -> List[Dict[str, str]]: - if not isinstance(selected_images, dict): - return [] - urls: List[Dict[str, str]] = [] - try: - for image in selected_images.values(): - display = image.get("display") if isinstance(image, dict) else None - if isinstance(display, dict): - # Prefer English if available, else any string value - if isinstance(display.get("en"), str): - urls.append({"url": display["en"]}) - else: - for v in display.values(): - if isinstance(v, str): - urls.append({"url": v}) - break - except Exception: - # Be tolerant to odd structures - return urls - return urls - - -def map_ingredient_node(node: Dict[str, Any]) -> Dict[str, Any]: - mapped: Dict[str, Any] = { - "name": node.get("text"), - "vegan": node.get("vegan"), - "vegetarian": node.get("vegetarian"), - "ingredients": [], - } - sub = node.get("ingredients") - if isinstance(sub, list) and len(sub) > 0: - mapped["ingredients"] = [map_ingredient_node(child) for child in sub if isinstance(child, dict)] - return mapped - - -def extract_projection(product: Dict[str, Any]) -> Dict[str, Any]: - # Barcode / code - barcode: Optional[str] = None - code = product.get("code") - if isinstance(code, str) and code.strip(): - barcode = code.strip() - elif isinstance(code, (int, float)): - barcode = str(code) - elif isinstance(product.get("_id"), str): - barcode = product.get("_id") - - # Brand - brand: Optional[str] = None - brand_owner = product.get("brand_owner") - if isinstance(brand_owner, str) and brand_owner.strip(): - brand = brand_owner.strip() - else: - brands = product.get("brands") - if isinstance(brands, str) and brands.strip(): - # OFF brands is comma-separated; take the first token - brand = brands.split(",")[0].strip() - - # Name - name: Optional[str] = None - product_name = product.get("product_name") - if isinstance(product_name, str) and product_name.strip(): - name = product_name.strip() - else: - # Try language-specific variants if present - for k, v in product.items(): - if k.startswith("product_name_") and isinstance(v, str) and v.strip(): - name = v.strip() - break - - # Ingredients - ingredients_list: List[Dict[str, Any]] = [] - raw_ingredients = product.get("ingredients") - if isinstance(raw_ingredients, list) and len(raw_ingredients) > 0: - ingredients_list = [map_ingredient_node(node) for node in raw_ingredients if isinstance(node, dict)] - - # Images - images: List[Dict[str, str]] = [] - selected_images = product.get("selected_images") - images = 
extract_display_image_urls(selected_images) - - return { - "barcode": barcode, - "brand": brand, - "name": name, - "ingredients": ingredients_list, - "images": images, - } - - -def json_bytes(value: Any) -> int: - try: - return len(json.dumps(value, ensure_ascii=False, separators=(",", ":")).encode("utf-8")) - except Exception: - return 0 - - -def utf8_bytes(value: Optional[str]) -> int: - if value is None: - return 0 - try: - return len(value.encode("utf-8")) - except Exception: - return 0 - - -def run(lines: Iterable[str], sample: int = 0) -> Dict[str, Any]: - start = time.time() - - total_records = 0 - projected_records = 0 - barcode_bytes_total = 0 - brand_bytes_total = 0 - name_bytes_total = 0 - ingredients_bytes_total = 0 - images_bytes_total = 0 - - nonempty_brand = 0 - nonempty_name = 0 - nonempty_ingredients = 0 - nonempty_images = 0 - - processed = 0 - for raw in lines: - if sample and processed >= sample: - break - total_records += 1 - raw = raw.strip() - if not raw: - continue - try: - product = json.loads(raw) - except Exception: - continue - - proj = extract_projection(product) - if proj.get("barcode"): - projected_records += 1 - barcode_bytes_total += utf8_bytes(proj.get("barcode")) - - b = proj.get("brand") - if b: - nonempty_brand += 1 - brand_bytes_total += utf8_bytes(b) - - n = proj.get("name") - if n: - nonempty_name += 1 - name_bytes_total += utf8_bytes(n) - - ing = proj.get("ingredients") - if isinstance(ing, list) and len(ing) > 0: - nonempty_ingredients += 1 - ingredients_bytes_total += json_bytes(ing) - - imgs = proj.get("images") - if isinstance(imgs, list) and len(imgs) > 0: - nonempty_images += 1 - images_bytes_total += json_bytes(imgs) - - processed += 1 - - elapsed = time.time() - start - - result = { - "total_records": total_records, - "projected_records_with_barcode": projected_records, - "barcode_bytes_total": barcode_bytes_total, - "brand_bytes_total": brand_bytes_total, - "name_bytes_total": name_bytes_total, - "ingredients_bytes_total": ingredients_bytes_total, - "images_bytes_total": images_bytes_total, - "nonempty_counts": { - "brand": nonempty_brand, - "name": nonempty_name, - "ingredients": nonempty_ingredients, - "images": nonempty_images, - }, - "elapsed_seconds": elapsed, - } - - totals_payload = ( - barcode_bytes_total - + brand_bytes_total - + name_bytes_total - + ingredients_bytes_total - + images_bytes_total - ) - result["projected_payload_bytes_total"] = totals_payload - result["avg_payload_bytes_per_projected_row"] = ( - (totals_payload / projected_records) if projected_records else 0.0 - ) - - return result - - -def main() -> None: - args = parse_args() - if args.stdin: - lines = iter_gz_lines_from_stdin() - else: - lines = iter_gz_lines_from_file(args.input) - - result = run(lines, sample=args.sample) - - if args.output: - with open(args.output, "w", encoding="utf-8") as f: - json.dump(result, f, ensure_ascii=False, indent=2) - else: - print(json.dumps(result, ensure_ascii=False, indent=2)) - - -if __name__ == "__main__": - main() - diff --git a/local/off_estimator_sample.json b/local/off_estimator_sample.json deleted file mode 100644 index 5e3bf48..0000000 --- a/local/off_estimator_sample.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "total_records": 100000, - "projected_records_with_barcode": 100000, - "barcode_bytes_total": 1288365, - "brand_bytes_total": 1796855, - "name_bytes_total": 2689661, - "ingredients_bytes_total": 108627317, - "images_bytes_total": 0, - "nonempty_counts": { - "brand": 98819, - "name": 99632, - "ingredients": 96406, 
- "images": 0 - }, - "elapsed_seconds": 53.94886898994446, - "projected_payload_bytes_total": 114402198, - "avg_payload_bytes_per_projected_row": 1144.02198 -} diff --git a/local/off_jsonl_linecount.txt b/local/off_jsonl_linecount.txt deleted file mode 100644 index 73383a4..0000000 --- a/local/off_jsonl_linecount.txt +++ /dev/null @@ -1 +0,0 @@ -4046118 From c48d7a231174705e804f48ba6428afc6ca440846 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 16:17:47 +0530 Subject: [PATCH 07/21] fix: resolve merge conflict in tables.sql --- supabase/database/tables.sql | 3 --- 1 file changed, 3 deletions(-) diff --git a/supabase/database/tables.sql b/supabase/database/tables.sql index 1b01d59..14f91a5 100644 --- a/supabase/database/tables.sql +++ b/supabase/database/tables.sql @@ -1,6 +1,4 @@ -<<<<<<< Current (Your changes) -======= -------------------------------------------------------------------------------- create table @@ -42,7 +40,6 @@ create trigger trg_inventory_cache_updated_at before update on public.inventory_cache for each row execute function set_inventory_cache_updated_at(); ->>>>>>> Incoming (Background Agent changes) -------------------------------------------------------------------------------- create table From 97505a78c906ecd8959c47c4dd1683b08dac9ea3 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 16:34:00 +0530 Subject: [PATCH 08/21] feat: add .env template and load env vars in off_ingest script --- .gitignore | 1 + local/.env.template | 12 ++++++++++++ local/off_ingest.ts | 27 +++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 local/.env.template diff --git a/.gitignore b/.gitignore index 356448e..508fd10 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ local/finetuning/preferencevalidatordataset/.env +local/.env diff --git a/local/.env.template b/local/.env.template new file mode 100644 index 0000000..91e50fb --- /dev/null +++ b/local/.env.template @@ -0,0 +1,12 @@ +# Supabase Configuration +# Copy this file to .env and fill in your actual values + +# Your Supabase project URL +SUPABASE_URL=https://your-project-ref.supabase.co + +# Your Supabase secret key (new key type, replaces SUPABASE_SERVICE_ROLE_KEY) +# Get this from your Supabase dashboard > Settings > API +SUPABASE_SECRET_KEY=your_secret_key_here + +# Legacy fallback (if you haven't migrated to new keys yet) +# SUPABASE_SERVICE_ROLE_KEY=your_legacy_service_role_key_here diff --git a/local/off_ingest.ts b/local/off_ingest.ts index d1f072a..4437c28 100644 --- a/local/off_ingest.ts +++ b/local/off_ingest.ts @@ -1,9 +1,29 @@ // deno run -A --import-map=local/import_map.json local/off_ingest.ts -// Environment: SUPABASE_URL, SUPABASE_SECRET_KEY must be set for upload -// Note: SUPABASE_SECRET_KEY is the new key type (replaces SUPABASE_SERVICE_ROLE_KEY) +// Environment: Copy .env.template to .env and fill in your values import { createClient } from "@supabase/supabase-js"; +// Load environment variables from .env file +async function loadEnv() { + try { + const envText = await Deno.readTextFile("local/.env"); + const lines = envText.split("\n"); + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed && !trimmed.startsWith("#")) { + const [key, ...valueParts] = trimmed.split("="); + if (key && valueParts.length > 0) { + const value = valueParts.join("=").trim(); + Deno.env.set(key.trim(), value); + } + } + } + } catch (error) { + console.warn("āš ļø Could not load .env file:", error.message); + console.warn(" 
Make sure to copy .env.template to .env and fill in your values"); + } +} + type Ingredient = { name: string; vegan?: boolean; @@ -262,6 +282,9 @@ async function uploadJsonlToSupabase(path: string) { } async function main() { + // Load environment variables from .env file + await loadEnv(); + console.log("Downloading OFF JSONL.gz and streaming transform..."); const rows = projectRows(iterLinesFromGzip(OFF_JSONL_GZ_URL)); const stats = await writeJsonl(rows, OUTPUT_PATH); From 54622bfded027af044f4d7cd410ed16a056f4ea2 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 17:07:45 +0530 Subject: [PATCH 09/21] refactor: use npm: specifier for standalone script (no import map needed) --- local/off_ingest.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/local/off_ingest.ts b/local/off_ingest.ts index 4437c28..aa12dfc 100644 --- a/local/off_ingest.ts +++ b/local/off_ingest.ts @@ -1,7 +1,7 @@ -// deno run -A --import-map=local/import_map.json local/off_ingest.ts +// deno run -A local/off_ingest.ts // Environment: Copy .env.template to .env and fill in your values -import { createClient } from "@supabase/supabase-js"; +import { createClient } from "npm:@supabase/supabase-js@2.39.3"; // Load environment variables from .env file async function loadEnv() { From a7ad82312fcfc4085e8a970d6157f9990a4e33e9 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 17:08:56 +0530 Subject: [PATCH 10/21] feat: add batch upload with user confirmation every 5 batches --- local/off_ingest.ts | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/local/off_ingest.ts b/local/off_ingest.ts index aa12dfc..b6d01a7 100644 --- a/local/off_ingest.ts +++ b/local/off_ingest.ts @@ -45,7 +45,8 @@ type CacheRow = { const OFF_JSONL_GZ_URL = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"; const OUTPUT_PATH = "local/off_inventory_cache.jsonl"; -const BATCH_UPLOAD_SIZE = 500; +const BATCH_UPLOAD_SIZE = 1000; +const BATCHES_PER_CONFIRMATION = 5; // Upload 5 batches (5000 rows) before asking for confirmation function mapIngredient(node: any): Ingredient { const item: Ingredient = { @@ -235,6 +236,9 @@ async function uploadJsonlToSupabase(path: string) { let pending: any[] = []; let leftover = ""; let total = 0; + let batchCount = 0; + let uploadedBatches = 0; + try { while (true) { const read = await file.read(buf); @@ -252,17 +256,37 @@ async function uploadJsonlToSupabase(path: string) { row.last_refreshed_at = new Date().toISOString(); pending.push(row); total++; + if (pending.length >= BATCH_UPLOAD_SIZE) { + // Upload this batch const { error } = await supabase.from("inventory_cache").upsert(pending, { onConflict: "barcode" }); if (error) throw error; + + batchCount++; + uploadedBatches++; + console.log(`āœ… Uploaded batch ${batchCount} (${pending.length} rows) - Total: ${total} rows`); + + // Check if we need confirmation + if (uploadedBatches >= BATCHES_PER_CONFIRMATION) { + console.log(`\nšŸ“Š Progress: ${total} rows uploaded in ${batchCount} batches`); + const continueUpload = confirm(`Continue uploading? 
+              if (!continueUpload) {
+                console.log("❌ Upload cancelled by user");
+                return;
+              }
+              uploadedBatches = 0; // Reset counter
+            }
+
            pending = [];
           }
         } catch (_) {
-          // skip
+          // skip invalid JSON
         }
       }
       leftover = data;
     }
+
+    // Handle remaining data
     if (leftover.trim()) {
       try {
         const row = JSON.parse(leftover.trim());
@@ -271,14 +295,19 @@ async function uploadJsonlToSupabase(path: string) {
         total++;
       } catch (_) {}
     }
+
+    // Upload final batch if any
     if (pending.length) {
       const { error } = await supabase.from("inventory_cache").upsert(pending, { onConflict: "barcode" });
       if (error) throw error;
+      batchCount++;
+      console.log(`✅ Uploaded final batch ${batchCount} (${pending.length} rows)`);
     }
   } finally {
     file.close();
   }
-  console.log(`Uploaded/Upserted ${total} rows to inventory_cache`);
+
+  console.log(`\n🎉 Upload complete! Total: ${total} rows uploaded in ${batchCount} batches`);
 }

From 51b9068d7fbf53db83524e241875415f2fe64674 Mon Sep 17 00:00:00 2001
From: justanotheratom
Date: Tue, 30 Sep 2025 17:42:22 +0530
Subject: [PATCH 11/21] feat: add progress indicators for download and processing

---
 local/off_ingest.ts | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/local/off_ingest.ts b/local/off_ingest.ts
index b6d01a7..f583f75 100644
--- a/local/off_ingest.ts
+++ b/local/off_ingest.ts
@@ -139,12 +139,21 @@ function mapToCacheRow(product: any): CacheRow | null {
 }
 
 async function* iterLinesFromGzip(url: string): AsyncGenerator<string> {
+  console.log("📄 Downloading Open Food Facts data...");
   const res = await fetch(url);
   if (!res.body) throw new Error("No response body from OFF");
+
+  const contentLength = res.headers.get("content-length");
+  const totalBytes = contentLength ? parseInt(contentLength) : 0;
+  console.log(`📦 File size: ${totalBytes > 0 ? formatBytes(totalBytes) : "unknown"}`);
+
   const decompressed = res.body.pipeThrough(new DecompressionStream("gzip"));
   const textStream = decompressed.pipeThrough(new TextDecoderStream());
   const reader = textStream.getReader();
   let buf = "";
+  let lineCount = 0;
+  let lastProgressTime = Date.now();
+
   try {
     while (true) {
       const { value, done } = await reader.read();
@@ -154,23 +163,37 @@ async function* iterLinesFromGzip(url: string): AsyncGenerator<string> {
       while ((idx = buf.indexOf("\n")) !== -1) {
         const line = buf.slice(0, idx);
         buf = buf.slice(idx + 1);
+        lineCount++;
+
+        // Show progress every 50k lines or every 10 seconds
+        const now = Date.now();
+        if (lineCount % 50000 === 0 || now - lastProgressTime > 10000) {
+          console.log(`📊 Processed ${lineCount.toLocaleString()} products...`);
+          lastProgressTime = now;
+        }
+
         yield line;
       }
     }
     if (buf.length > 0) {
+      lineCount++;
      yield buf;
    }
  } finally {
    reader.releaseLock();
  }
+  console.log(`✅ Download complete! Processed ${lineCount.toLocaleString()} products`);
 }
 
 async function writeJsonl(rows: AsyncIterable<CacheRow>, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number } }> {
+  console.log("💾 Writing transformed data to local file...");
   const file = await Deno.open(outPath, { create: true, write: true, truncate: true });
   const encoder = new TextEncoder();
   let count = 0;
   let totalBytes = 0;
   let nonBrand = 0, nonName = 0, nonIng = 0, nonImg = 0;
+  let lastProgressTime = Date.now();
+
   try {
     for await (const row of rows) {
       count++;
@@ -181,10 +204,18 @@ async function writeJsonl(rows: AsyncIterable<CacheRow>, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number } }> {
       const json = JSON.stringify(row);
       totalBytes += encoder.encode(json).byteLength;
       await file.write(encoder.encode(json + "\n"));
+
+      // Show progress every 25k rows or every 5 seconds
+      const now = Date.now();
+      if (count % 25000 === 0 || now - lastProgressTime > 5000) {
+        console.log(`📝 Written ${count.toLocaleString()} products to local file...`);
+        lastProgressTime = now;
+      }
     }
   } finally {
     file.close();
   }
+  console.log(`✅ Local file complete! ${count.toLocaleString()} products written`);
   return { count, totalBytes, nonEmpty: { brand: nonBrand, name: nonName, ingredients: nonIng, images: nonImg } };
 }

From e4140316bd200700045e65496f1a64d4731c9f30 Mon Sep 17 00:00:00 2001
From: justanotheratom
Date: Tue, 30 Sep 2025 17:46:59 +0530
Subject: [PATCH 12/21] feat: add detailed validation statistics to track invalid/empty products

---
 local/off_ingest.ts | 58 ++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 50 insertions(+), 8 deletions(-)

diff --git a/local/off_ingest.ts b/local/off_ingest.ts
index f583f75..4bdfabe 100644
--- a/local/off_ingest.ts
+++ b/local/off_ingest.ts
@@ -185,7 +185,7 @@ async function* iterLinesFromGzip(url: string): AsyncGenerator<string> {
   console.log(`✅ Download complete! Processed ${lineCount.toLocaleString()} products`);
Processed ${lineCount.toLocaleString()} products`); } -async function writeJsonl(rows: AsyncIterable, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number } }> { +async function writeJsonl(rows: AsyncIterable<{ row: CacheRow; stats: any }>, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number }; validationStats: any }> { console.log("šŸ’¾ Writing transformed data to local file..."); const file = await Deno.open(outPath, { create: true, write: true, truncate: true }); const encoder = new TextEncoder(); @@ -193,9 +193,10 @@ async function writeJsonl(rows: AsyncIterable, outPath: string): Promi let totalBytes = 0; let nonBrand = 0, nonName = 0, nonIng = 0, nonImg = 0; let lastProgressTime = Date.now(); + let lastStats: any = null; try { - for await (const row of rows) { + for await (const { row, stats } of rows) { count++; if (row.brand) nonBrand++; if (row.name) nonName++; @@ -204,33 +205,62 @@ async function writeJsonl(rows: AsyncIterable, outPath: string): Promi const json = JSON.stringify(row); totalBytes += encoder.encode(json).byteLength; await file.write(encoder.encode(json + "\n")); + lastStats = stats; // Show progress every 25k rows or every 5 seconds const now = Date.now(); if (count % 25000 === 0 || now - lastProgressTime > 5000) { - console.log(`šŸ“ Written ${count.toLocaleString()} products to local file...`); + const validRate = ((lastStats.validProducts / lastStats.totalLines) * 100).toFixed(1); + const invalidRate = (((lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode) / lastStats.totalLines) * 100).toFixed(1); + console.log(`šŸ“ Written ${count.toLocaleString()} products (${validRate}% valid, ${invalidRate}% invalid)`); lastProgressTime = now; } } } finally { file.close(); } + + // Final validation statistics + if (lastStats) { + console.log(`\nšŸ“Š Validation Statistics:`); + console.log(` Total lines processed: ${lastStats.totalLines.toLocaleString()}`); + console.log(` Valid products: ${lastStats.validProducts.toLocaleString()} (${((lastStats.validProducts / lastStats.totalLines) * 100).toFixed(1)}%)`); + console.log(` Invalid products: ${(lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode).toLocaleString()} (${(((lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode) / lastStats.totalLines) * 100).toFixed(1)}%)`); + console.log(` - Empty lines: ${lastStats.emptyLines.toLocaleString()}`); + console.log(` - JSON parse errors: ${lastStats.jsonParseErrors.toLocaleString()}`); + console.log(` - No barcode: ${lastStats.noBarcode.toLocaleString()}`); + } + console.log(`āœ… Local file complete! 
${count.toLocaleString()} products written`); - return { count, totalBytes, nonEmpty: { brand: nonBrand, name: nonName, ingredients: nonIng, images: nonImg } }; + return { count, totalBytes, nonEmpty: { brand: nonBrand, name: nonName, ingredients: nonIng, images: nonImg }, validationStats: lastStats }; } -async function* projectRows(lines: AsyncIterable): AsyncGenerator { +async function* projectRows(lines: AsyncIterable): AsyncGenerator<{ row: CacheRow; stats: { totalLines: number; emptyLines: number; jsonParseErrors: number; noBarcode: number; validProducts: number } }> { + let totalLines = 0; + let emptyLines = 0; + let jsonParseErrors = 0; + let noBarcode = 0; + let validProducts = 0; + for await (const line of lines) { + totalLines++; const trimmed = line.trim(); - if (!trimmed) continue; + if (!trimmed) { + emptyLines++; + continue; + } + try { const product = JSON.parse(trimmed); const row = mapToCacheRow(product); if (row && row.barcode) { - yield row; + validProducts++; + yield { row, stats: { totalLines, emptyLines, jsonParseErrors, noBarcode, validProducts } }; + } else { + noBarcode++; } } catch (_) { - // skip invalid lines + jsonParseErrors++; } } } @@ -353,6 +383,18 @@ async function main() { console.log(` name: ${stats.nonEmpty.name}`); console.log(` ingredients: ${stats.nonEmpty.ingredients}`); console.log(` images: ${stats.nonEmpty.images}`); + + // Show validation summary + if (stats.validationStats) { + console.log("\nšŸ“Š Data Quality Summary:"); + const validRate = ((stats.validationStats.validProducts / stats.validationStats.totalLines) * 100).toFixed(1); + console.log(` Success rate: ${validRate}% (${stats.validationStats.validProducts.toLocaleString()} valid out of ${stats.validationStats.totalLines.toLocaleString()} total)`); + console.log(` Invalid breakdown:`); + console.log(` - Empty lines: ${stats.validationStats.emptyLines.toLocaleString()}`); + console.log(` - JSON parse errors: ${stats.validationStats.jsonParseErrors.toLocaleString()}`); + console.log(` - No barcode: ${stats.validationStats.noBarcode.toLocaleString()}`); + } + const proceed = await askUploadPermission(stats); if (!proceed) { console.log("Upload skipped."); From 162c9e7349675fde809e6a986843d6d5f0447117 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Tue, 30 Sep 2025 17:51:00 +0530 Subject: [PATCH 13/21] perf: optimize for speed with larger batches, parallel uploads, and better I/O --- local/off_ingest.ts | 48 +++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/local/off_ingest.ts b/local/off_ingest.ts index 4bdfabe..0442a60 100644 --- a/local/off_ingest.ts +++ b/local/off_ingest.ts @@ -1,5 +1,6 @@ -// deno run -A local/off_ingest.ts +// deno run -A --unstable-kv local/off_ingest.ts // Environment: Copy .env.template to .env and fill in your values +// Performance: Use --unstable-kv for better memory management import { createClient } from "npm:@supabase/supabase-js@2.39.3"; @@ -45,8 +46,9 @@ type CacheRow = { const OFF_JSONL_GZ_URL = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"; const OUTPUT_PATH = "local/off_inventory_cache.jsonl"; -const BATCH_UPLOAD_SIZE = 1000; -const BATCHES_PER_CONFIRMATION = 5; // Upload 5 batches (5000 rows) before asking for confirmation +const BATCH_UPLOAD_SIZE = 5000; // Increased from 1000 +const BATCHES_PER_CONFIRMATION = 2; // Upload 2 batches (10k rows) before asking for confirmation +const PARALLEL_BATCHES = 3; // Process multiple batches in parallel function 
mapIngredient(node: any): Ingredient { const item: Ingredient = { @@ -284,6 +286,12 @@ async function askUploadPermission(stats: { count: number; totalBytes: number }) return !!answer; } +async function uploadBatch(supabase: any, batch: any[], batchNumber: number): Promise { + const { error } = await supabase.from("inventory_cache").upsert(batch, { onConflict: "barcode" }); + if (error) throw error; + console.log(`āœ… Uploaded batch ${batchNumber} (${batch.length} rows)`); +} + async function uploadJsonlToSupabase(path: string) { const url = Deno.env.get("SUPABASE_URL") ?? ""; const key = Deno.env.get("SUPABASE_SECRET_KEY") ?? Deno.env.get("SUPABASE_SERVICE_ROLE_KEY") ?? ""; @@ -292,13 +300,14 @@ async function uploadJsonlToSupabase(path: string) { const file = await Deno.open(path, { read: true }); const decoder = new TextDecoder(); - const bufSize = 1024 * 1024; + const bufSize = 2 * 1024 * 1024; // Increased buffer size const buf = new Uint8Array(bufSize); let pending: any[] = []; let leftover = ""; let total = 0; let batchCount = 0; let uploadedBatches = 0; + let pendingUploads: Promise[] = []; try { while (true) { @@ -319,16 +328,27 @@ async function uploadJsonlToSupabase(path: string) { total++; if (pending.length >= BATCH_UPLOAD_SIZE) { - // Upload this batch - const { error } = await supabase.from("inventory_cache").upsert(pending, { onConflict: "barcode" }); - if (error) throw error; - + // Start parallel upload + const batchToUpload = [...pending]; batchCount++; + const uploadPromise = uploadBatch(supabase, batchToUpload, batchCount); + pendingUploads.push(uploadPromise); + + // Wait for uploads if we have too many pending + if (pendingUploads.length >= PARALLEL_BATCHES) { + await Promise.all(pendingUploads); + pendingUploads = []; + } + uploadedBatches++; - console.log(`āœ… Uploaded batch ${batchCount} (${pending.length} rows) - Total: ${total} rows`); // Check if we need confirmation if (uploadedBatches >= BATCHES_PER_CONFIRMATION) { + // Wait for all pending uploads before asking + if (pendingUploads.length > 0) { + await Promise.all(pendingUploads); + pendingUploads = []; + } console.log(`\nšŸ“Š Progress: ${total} rows uploaded in ${batchCount} batches`); const continueUpload = confirm(`Continue uploading? 
(${total} rows uploaded so far) [y/N]`); if (!continueUpload) { @@ -359,10 +379,14 @@ async function uploadJsonlToSupabase(path: string) { // Upload final batch if any if (pending.length) { - const { error } = await supabase.from("inventory_cache").upsert(pending, { onConflict: "barcode" }); - if (error) throw error; batchCount++; - console.log(`āœ… Uploaded final batch ${batchCount} (${pending.length} rows)`); + const uploadPromise = uploadBatch(supabase, pending, batchCount); + pendingUploads.push(uploadPromise); + } + + // Wait for all remaining uploads + if (pendingUploads.length > 0) { + await Promise.all(pendingUploads); } } finally { file.close(); From a2ac1b2b11298dd9636d460fa47d29bddbf4e2ca Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Wed, 1 Oct 2025 14:59:16 +0530 Subject: [PATCH 14/21] feat: organize OpenFoodFacts scripts and add inventory cache to gitignore - Move off_ingest.ts and off_upload_batch.ts to local/openfoodfacts/ folder - Add off_inventory_cache.jsonl to .gitignore (large generated file) - Fix parallel upload logic to prevent memory exhaustion - Add deduplication to handle duplicate barcodes in batches - Successfully uploaded 3M+ products to Supabase inventory_cache --- .gitignore | 1 + deno.lock | 76 +++ local/.gitignore | 3 +- local/off_ingest.ts | 441 ------------- local/openfoodfacts/off_ingest.ts | 826 ++++++++++++++++++++++++ local/openfoodfacts/off_upload_batch.ts | 56 ++ 6 files changed, 961 insertions(+), 442 deletions(-) delete mode 100644 local/off_ingest.ts create mode 100644 local/openfoodfacts/off_ingest.ts create mode 100644 local/openfoodfacts/off_upload_batch.ts diff --git a/.gitignore b/.gitignore index 508fd10..0ddfe2e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ local/finetuning/preferencevalidatordataset/.env local/.env +local/temp/ diff --git a/deno.lock b/deno.lock index dcfa245..7901036 100644 --- a/deno.lock +++ b/deno.lock @@ -1,17 +1,93 @@ { "version": "5", "specifiers": { + "npm:@supabase/supabase-js@2.39.3": "2.39.3", "npm:@types/node@*": "24.2.0" }, "npm": { + "@supabase/functions-js@2.4.5": { + "integrity": "sha512-v5GSqb9zbosquTo6gBwIiq7W9eQ7rE5QazsK/ezNiQXdCbY+bH8D9qEaBIkhVvX4ZRW5rP03gEfw5yw9tiq4EQ==", + "dependencies": [ + "@supabase/node-fetch" + ] + }, + "@supabase/gotrue-js@2.72.0": { + "integrity": "sha512-cJSFgvZhhTEyOLAwkadKAlx0zUOzUFSpMd/42iacF1C2pm4FLnc6ICHT0D/q2gPr/tz65tu0g0s51n4od82yEQ==", + "dependencies": [ + "@supabase/node-fetch" + ] + }, + "@supabase/node-fetch@2.6.15": { + "integrity": "sha512-1ibVeYUacxWYi9i0cf5efil6adJ9WRyZBLivgjs+AUpewx1F3xPi7gLgaASI2SmIQxPoCEjAsLAzKPgMJVgOUQ==", + "dependencies": [ + "whatwg-url" + ] + }, + "@supabase/postgrest-js@1.21.3": { + "integrity": "sha512-rg3DmmZQKEVCreXq6Am29hMVe1CzemXyIWVYyyua69y6XubfP+DzGfLxME/1uvdgwqdoaPbtjBDpEBhqxq1ZwA==", + "dependencies": [ + "@supabase/node-fetch" + ] + }, + "@supabase/realtime-js@2.15.4": { + "integrity": "sha512-e/FYIWjvQJHOCNACWehnKvg26zosju3694k0NMUNb+JGLdvHJzEa29ZVVLmawd2kvx4hdbv8mxSqfttRnH3+DA==", + "dependencies": [ + "@supabase/node-fetch", + "@types/phoenix", + "@types/ws", + "ws" + ] + }, + "@supabase/storage-js@2.11.0": { + "integrity": "sha512-Y+kx/wDgd4oasAgoAq0bsbQojwQ+ejIif8uczZ9qufRHWFLMU5cODT+ApHsSrDufqUcVKt+eyxtOXSkeh2v9ww==", + "dependencies": [ + "@supabase/node-fetch" + ] + }, + "@supabase/supabase-js@2.39.3": { + "integrity": "sha512-NoltJSaJNKDJNutO5sJPAAi5RIWrn1z2XH+ig1+cHDojT6BTN7TvZPNa3Kq3gFQWfO5H1N9El/bCTZJ3iFW2kQ==", + "dependencies": [ + "@supabase/functions-js", + "@supabase/gotrue-js", + 
"@supabase/node-fetch", + "@supabase/postgrest-js", + "@supabase/realtime-js", + "@supabase/storage-js" + ] + }, "@types/node@24.2.0": { "integrity": "sha512-3xyG3pMCq3oYCNg7/ZP+E1ooTaGB4cG8JWRsqqOYQdbWNY4zbaV0Ennrd7stjiJEFZCaybcIgpTjJWHRfBSIDw==", "dependencies": [ "undici-types" ] }, + "@types/phoenix@1.6.6": { + "integrity": "sha512-PIzZZlEppgrpoT2QgbnDU+MMzuR6BbCjllj0bM70lWoejMeNJAxCchxnv7J3XFkI8MpygtRpzXrIlmWUBclP5A==" + }, + "@types/ws@8.18.1": { + "integrity": "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg==", + "dependencies": [ + "@types/node" + ] + }, + "tr46@0.0.3": { + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, "undici-types@7.10.0": { "integrity": "sha512-t5Fy/nfn+14LuOc2KNYg75vZqClpAiqscVvMygNnlsHBFpSXdJaYtXMcdNLpl/Qvc3P2cB3s6lOV51nqsFq4ag==" + }, + "webidl-conversions@3.0.1": { + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, + "whatwg-url@5.0.0": { + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": [ + "tr46", + "webidl-conversions" + ] + }, + "ws@8.18.3": { + "integrity": "sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==" } }, "redirects": { diff --git a/local/.gitignore b/local/.gitignore index 0275b13..6edb89a 100644 --- a/local/.gitignore +++ b/local/.gitignore @@ -1,2 +1,3 @@ supabase-service.json -datasets/ \ No newline at end of file +datasets/ +off_inventory_cache.jsonl \ No newline at end of file diff --git a/local/off_ingest.ts b/local/off_ingest.ts deleted file mode 100644 index 0442a60..0000000 --- a/local/off_ingest.ts +++ /dev/null @@ -1,441 +0,0 @@ -// deno run -A --unstable-kv local/off_ingest.ts -// Environment: Copy .env.template to .env and fill in your values -// Performance: Use --unstable-kv for better memory management - -import { createClient } from "npm:@supabase/supabase-js@2.39.3"; - -// Load environment variables from .env file -async function loadEnv() { - try { - const envText = await Deno.readTextFile("local/.env"); - const lines = envText.split("\n"); - for (const line of lines) { - const trimmed = line.trim(); - if (trimmed && !trimmed.startsWith("#")) { - const [key, ...valueParts] = trimmed.split("="); - if (key && valueParts.length > 0) { - const value = valueParts.join("=").trim(); - Deno.env.set(key.trim(), value); - } - } - } - } catch (error) { - console.warn("āš ļø Could not load .env file:", error.message); - console.warn(" Make sure to copy .env.template to .env and fill in your values"); - } -} - -type Ingredient = { - name: string; - vegan?: boolean; - vegetarian?: boolean; - ingredients?: Ingredient[]; -}; - -type Image = { url: string }; - -type CacheRow = { - barcode: string; - data_source: string; - brand?: string; - name?: string; - ingredients: Ingredient[]; - images: Image[]; - off_last_modified_t?: number; -}; - -const OFF_JSONL_GZ_URL = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"; -const OUTPUT_PATH = "local/off_inventory_cache.jsonl"; -const BATCH_UPLOAD_SIZE = 5000; // Increased from 1000 -const BATCHES_PER_CONFIRMATION = 2; // Upload 2 batches (10k rows) before asking for confirmation -const PARALLEL_BATCHES = 3; // Process multiple batches in parallel - -function mapIngredient(node: any): Ingredient { - const item: Ingredient = { - name: typeof node?.text === "string" ? 
node.text : undefined as unknown as string, - vegan: node?.vegan, - vegetarian: node?.vegetarian, - ingredients: [], - }; - if (Array.isArray(node?.ingredients) && node.ingredients.length > 0) { - item.ingredients = node.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); - } - return item; -} - -function extractDisplayImageUrls(selectedImages: any): Image[] { - if (!selectedImages || typeof selectedImages !== "object") return []; - const urls: Image[] = []; - try { - for (const value of Object.values(selectedImages as Record)) { - const display = (value as any)?.display; - if (display && typeof display === "object") { - if (typeof display.en === "string" && display.en) { - urls.push({ url: display.en }); - } else { - for (const v of Object.values(display)) { - if (typeof v === "string" && v) { - urls.push({ url: v }); - break; - } - } - } - } - } - } catch (_) { - // ignore malformed structures - } - return urls; -} - -function mapToCacheRow(product: any): CacheRow | null { - const dataSource = "openfoodfacts/v3"; - - let barcode: string | undefined; - const code = product?.code; - if (typeof code === "string" && code.trim()) { - barcode = code.trim(); - } else if (typeof code === "number") { - barcode = String(code); - } else if (typeof product?._id === "string" && product._id.trim()) { - barcode = product._id.trim(); - } - if (!barcode) return null; - - let brand: string | undefined; - if (typeof product?.brand_owner === "string" && product.brand_owner.trim()) { - brand = product.brand_owner.trim(); - } else if (typeof product?.brands === "string" && product.brands.trim()) { - brand = product.brands.split(",")[0]?.trim(); - } - - let name: string | undefined; - if (typeof product?.product_name === "string" && product.product_name.trim()) { - name = product.product_name.trim(); - } else { - for (const [k, v] of Object.entries(product ?? {})) { - if (k.startsWith("product_name_") && typeof v === "string" && (v as string).trim()) { - name = (v as string).trim(); - break; - } - } - } - - let ingredients: Ingredient[] = []; - if (Array.isArray(product?.ingredients) && product.ingredients.length > 0) { - ingredients = product.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); - } - - const images = extractDisplayImageUrls(product?.selected_images); - const off_last_modified_t = typeof product?.last_modified_t === "number" ? product.last_modified_t : undefined; - - return { - barcode, - data_source: dataSource, - brand, - name, - ingredients, - images, - off_last_modified_t, - }; -} - -async function* iterLinesFromGzip(url: string): AsyncGenerator { - console.log("šŸ“„ Downloading Open Food Facts data..."); - const res = await fetch(url); - if (!res.body) throw new Error("No response body from OFF"); - - const contentLength = res.headers.get("content-length"); - const totalBytes = contentLength ? parseInt(contentLength) : 0; - console.log(`šŸ“¦ File size: ${totalBytes > 0 ? formatBytes(totalBytes) : "unknown"}`); - - const decompressed = res.body.pipeThrough(new DecompressionStream("gzip")); - const textStream = decompressed.pipeThrough(new TextDecoderStream()); - const reader = textStream.getReader(); - let buf = ""; - let lineCount = 0; - let lastProgressTime = Date.now(); - - try { - while (true) { - const { value, done } = await reader.read(); - if (done) break; - buf += value ?? 
""; - let idx: number; - while ((idx = buf.indexOf("\n")) !== -1) { - const line = buf.slice(0, idx); - buf = buf.slice(idx + 1); - lineCount++; - - // Show progress every 50k lines or every 10 seconds - const now = Date.now(); - if (lineCount % 50000 === 0 || now - lastProgressTime > 10000) { - console.log(`šŸ“Š Processed ${lineCount.toLocaleString()} products...`); - lastProgressTime = now; - } - - yield line; - } - } - if (buf.length > 0) { - lineCount++; - yield buf; - } - } finally { - reader.releaseLock(); - } - console.log(`āœ… Download complete! Processed ${lineCount.toLocaleString()} products`); -} - -async function writeJsonl(rows: AsyncIterable<{ row: CacheRow; stats: any }>, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number }; validationStats: any }> { - console.log("šŸ’¾ Writing transformed data to local file..."); - const file = await Deno.open(outPath, { create: true, write: true, truncate: true }); - const encoder = new TextEncoder(); - let count = 0; - let totalBytes = 0; - let nonBrand = 0, nonName = 0, nonIng = 0, nonImg = 0; - let lastProgressTime = Date.now(); - let lastStats: any = null; - - try { - for await (const { row, stats } of rows) { - count++; - if (row.brand) nonBrand++; - if (row.name) nonName++; - if (row.ingredients && row.ingredients.length) nonIng++; - if (row.images && row.images.length) nonImg++; - const json = JSON.stringify(row); - totalBytes += encoder.encode(json).byteLength; - await file.write(encoder.encode(json + "\n")); - lastStats = stats; - - // Show progress every 25k rows or every 5 seconds - const now = Date.now(); - if (count % 25000 === 0 || now - lastProgressTime > 5000) { - const validRate = ((lastStats.validProducts / lastStats.totalLines) * 100).toFixed(1); - const invalidRate = (((lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode) / lastStats.totalLines) * 100).toFixed(1); - console.log(`šŸ“ Written ${count.toLocaleString()} products (${validRate}% valid, ${invalidRate}% invalid)`); - lastProgressTime = now; - } - } - } finally { - file.close(); - } - - // Final validation statistics - if (lastStats) { - console.log(`\nšŸ“Š Validation Statistics:`); - console.log(` Total lines processed: ${lastStats.totalLines.toLocaleString()}`); - console.log(` Valid products: ${lastStats.validProducts.toLocaleString()} (${((lastStats.validProducts / lastStats.totalLines) * 100).toFixed(1)}%)`); - console.log(` Invalid products: ${(lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode).toLocaleString()} (${(((lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode) / lastStats.totalLines) * 100).toFixed(1)}%)`); - console.log(` - Empty lines: ${lastStats.emptyLines.toLocaleString()}`); - console.log(` - JSON parse errors: ${lastStats.jsonParseErrors.toLocaleString()}`); - console.log(` - No barcode: ${lastStats.noBarcode.toLocaleString()}`); - } - - console.log(`āœ… Local file complete! 
${count.toLocaleString()} products written`); - return { count, totalBytes, nonEmpty: { brand: nonBrand, name: nonName, ingredients: nonIng, images: nonImg }, validationStats: lastStats }; -} - -async function* projectRows(lines: AsyncIterable): AsyncGenerator<{ row: CacheRow; stats: { totalLines: number; emptyLines: number; jsonParseErrors: number; noBarcode: number; validProducts: number } }> { - let totalLines = 0; - let emptyLines = 0; - let jsonParseErrors = 0; - let noBarcode = 0; - let validProducts = 0; - - for await (const line of lines) { - totalLines++; - const trimmed = line.trim(); - if (!trimmed) { - emptyLines++; - continue; - } - - try { - const product = JSON.parse(trimmed); - const row = mapToCacheRow(product); - if (row && row.barcode) { - validProducts++; - yield { row, stats: { totalLines, emptyLines, jsonParseErrors, noBarcode, validProducts } }; - } else { - noBarcode++; - } - } catch (_) { - jsonParseErrors++; - } - } -} - -function formatBytes(bytes: number): string { - const units = ["B", "KB", "MB", "GB", "TB"] as const; - let i = 0; - let n = bytes; - while (n >= 1024 && i < units.length - 1) { - n /= 1024; - i++; - } - return `${n.toFixed(2)} ${units[i]}`; -} - -async function askUploadPermission(stats: { count: number; totalBytes: number }): Promise { - console.log("\nSummary:"); - console.log(` Rows: ${stats.count}`); - console.log(` Payload size (JSON only): ${formatBytes(stats.totalBytes)} (~${Math.round(stats.totalBytes / Math.max(1, stats.count))} B/row)`); - const answer = confirm("Upload to Supabase inventory_cache? This can take a long time. (y/N)"); - return !!answer; -} - -async function uploadBatch(supabase: any, batch: any[], batchNumber: number): Promise { - const { error } = await supabase.from("inventory_cache").upsert(batch, { onConflict: "barcode" }); - if (error) throw error; - console.log(`āœ… Uploaded batch ${batchNumber} (${batch.length} rows)`); -} - -async function uploadJsonlToSupabase(path: string) { - const url = Deno.env.get("SUPABASE_URL") ?? ""; - const key = Deno.env.get("SUPABASE_SECRET_KEY") ?? Deno.env.get("SUPABASE_SERVICE_ROLE_KEY") ?? 
""; - if (!url || !key) throw new Error("SUPABASE_URL and SUPABASE_SECRET_KEY (or SUPABASE_SERVICE_ROLE_KEY for legacy) must be set in environment"); - const supabase = createClient(url, key, { auth: { persistSession: false } }); - - const file = await Deno.open(path, { read: true }); - const decoder = new TextDecoder(); - const bufSize = 2 * 1024 * 1024; // Increased buffer size - const buf = new Uint8Array(bufSize); - let pending: any[] = []; - let leftover = ""; - let total = 0; - let batchCount = 0; - let uploadedBatches = 0; - let pendingUploads: Promise[] = []; - - try { - while (true) { - const read = await file.read(buf); - if (read === null) break; - const chunk = decoder.decode(buf.subarray(0, read)); - let data = leftover + chunk; - let idx: number; - while ((idx = data.indexOf("\n")) !== -1) { - const line = data.slice(0, idx); - data = data.slice(idx + 1); - if (!line) continue; - try { - const row = JSON.parse(line); - // Set last_refreshed_at during upsert - row.last_refreshed_at = new Date().toISOString(); - pending.push(row); - total++; - - if (pending.length >= BATCH_UPLOAD_SIZE) { - // Start parallel upload - const batchToUpload = [...pending]; - batchCount++; - const uploadPromise = uploadBatch(supabase, batchToUpload, batchCount); - pendingUploads.push(uploadPromise); - - // Wait for uploads if we have too many pending - if (pendingUploads.length >= PARALLEL_BATCHES) { - await Promise.all(pendingUploads); - pendingUploads = []; - } - - uploadedBatches++; - - // Check if we need confirmation - if (uploadedBatches >= BATCHES_PER_CONFIRMATION) { - // Wait for all pending uploads before asking - if (pendingUploads.length > 0) { - await Promise.all(pendingUploads); - pendingUploads = []; - } - console.log(`\nšŸ“Š Progress: ${total} rows uploaded in ${batchCount} batches`); - const continueUpload = confirm(`Continue uploading? (${total} rows uploaded so far) [y/N]`); - if (!continueUpload) { - console.log("āŒ Upload cancelled by user"); - return; - } - uploadedBatches = 0; // Reset counter - } - - pending = []; - } - } catch (_) { - // skip invalid JSON - } - } - leftover = data; - } - - // Handle remaining data - if (leftover.trim()) { - try { - const row = JSON.parse(leftover.trim()); - row.last_refreshed_at = new Date().toISOString(); - pending.push(row); - total++; - } catch (_) {} - } - - // Upload final batch if any - if (pending.length) { - batchCount++; - const uploadPromise = uploadBatch(supabase, pending, batchCount); - pendingUploads.push(uploadPromise); - } - - // Wait for all remaining uploads - if (pendingUploads.length > 0) { - await Promise.all(pendingUploads); - } - } finally { - file.close(); - } - - console.log(`\nšŸŽ‰ Upload complete! 
Total: ${total} rows uploaded in ${batchCount} batches`); -} - -async function main() { - // Load environment variables from .env file - await loadEnv(); - - console.log("Downloading OFF JSONL.gz and streaming transform..."); - const rows = projectRows(iterLinesFromGzip(OFF_JSONL_GZ_URL)); - const stats = await writeJsonl(rows, OUTPUT_PATH); - console.log("\nField coverage (non-empty counts):"); - console.log(` brand: ${stats.nonEmpty.brand}`); - console.log(` name: ${stats.nonEmpty.name}`); - console.log(` ingredients: ${stats.nonEmpty.ingredients}`); - console.log(` images: ${stats.nonEmpty.images}`); - - // Show validation summary - if (stats.validationStats) { - console.log("\nšŸ“Š Data Quality Summary:"); - const validRate = ((stats.validationStats.validProducts / stats.validationStats.totalLines) * 100).toFixed(1); - console.log(` Success rate: ${validRate}% (${stats.validationStats.validProducts.toLocaleString()} valid out of ${stats.validationStats.totalLines.toLocaleString()} total)`); - console.log(` Invalid breakdown:`); - console.log(` - Empty lines: ${stats.validationStats.emptyLines.toLocaleString()}`); - console.log(` - JSON parse errors: ${stats.validationStats.jsonParseErrors.toLocaleString()}`); - console.log(` - No barcode: ${stats.validationStats.noBarcode.toLocaleString()}`); - } - - const proceed = await askUploadPermission(stats); - if (!proceed) { - console.log("Upload skipped."); - return; - } - console.log("\nUploading to Supabase (batched upserts)..."); - const start = Date.now(); - await uploadJsonlToSupabase(OUTPUT_PATH); - const elapsed = (Date.now() - start) / 1000; - console.log(`Done in ${elapsed.toFixed(1)}s.`); -} - -if (import.meta.main) { - await main().catch((err) => { - console.error("Error:", err); - Deno.exit(1); - }); -} - - diff --git a/local/openfoodfacts/off_ingest.ts b/local/openfoodfacts/off_ingest.ts new file mode 100644 index 0000000..0cd8263 --- /dev/null +++ b/local/openfoodfacts/off_ingest.ts @@ -0,0 +1,826 @@ +// deno run -A --unstable-kv local/openfoodfacts/off_ingest.ts +// Environment: Copy .env.template to .env and fill in your values +// Performance: Use --unstable-kv for better memory management + +// Load environment variables from .env file +async function loadEnv() { + try { + const envText = await Deno.readTextFile("local/.env"); + const lines = envText.split("\n"); + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed && !trimmed.startsWith("#")) { + const [key, ...valueParts] = trimmed.split("="); + if (key && valueParts.length > 0) { + const value = valueParts.join("=").trim(); + Deno.env.set(key.trim(), value); + } + } + } + } catch (error) { + console.warn("āš ļø Could not load .env file:", error.message); + console.warn(" Make sure to copy .env.template to .env and fill in your values"); + } +} + +type Ingredient = { + name: string; + vegan?: boolean; + vegetarian?: boolean; + ingredients?: Ingredient[]; +}; + +type Image = { + url: string; + resolution?: string; + width?: number; + height?: number; +}; + +type CacheRow = { + barcode: string; + data_source: string; + brand?: string; + name?: string; + ingredients: Ingredient[]; + images: Image[]; + off_last_modified_t?: number; +}; + +const OFF_JSONL_GZ_URL = "https://static.openfoodfacts.org/data/openfoodfacts-products.jsonl.gz"; +const OUTPUT_PATH = "local/off_inventory_cache.jsonl"; +const BATCH_UPLOAD_SIZE = 1000; // Products per batch +const BATCHES_PER_CONFIRMATION = 500; // Upload 500 batches (500k rows) before asking for confirmation +const 
PARALLEL_BATCHES = 10; // Upload 10 batches in parallel = 10,000 products per batch group +const DELAY_BETWEEN_BATCH_GROUPS_MS = 500; // 500ms delay between batch groups to avoid rate limiting +const SAMPLE_SIZE = 10000; // Sample size for size estimation +const SAMPLE_LINES = 100000; // Only process first 100k lines for sampling + +function mapIngredient(node: any): Ingredient { + const item: Ingredient = { + name: typeof node?.text === "string" ? node.text : undefined as unknown as string, + vegan: node?.vegan, + vegetarian: node?.vegetarian, + ingredients: [], + }; + if (Array.isArray(node?.ingredients) && node.ingredients.length > 0) { + item.ingredients = node.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); + } + return item; +} + +function isValidImageUrl(url: string): boolean { + try { + const parsedUrl = new URL(url); + return parsedUrl.protocol === 'https:' && + parsedUrl.hostname === 'static.openfoodfacts.org' && + url.includes('/images/products/') && + url.endsWith('.jpg'); + } catch { + return false; + } +} + +function extractDisplayImageUrls(images: any): Image[] { + if (!images || typeof images !== "object") { + return []; + } + + const urls: Image[] = []; + const processedImages = new Set(); // Track processed image IDs to avoid duplicates + + try { + // Open Food Facts image structure: images contains both numeric keys (1,2,3,4) and language-specific keys (front_en, ingredients_fr, etc.) + // Language-specific keys reference numeric images via imgid + + // First, collect all language-specific front images + const languageKeys = ['front_en', 'front_fr', 'front_de', 'front_es', 'front_it', 'front_pt', 'front_nl', 'front_sv', 'front_da', 'front_no', 'front_fi']; + + for (const langKey of languageKeys) { + const imageRef = images[langKey]; + if (imageRef && typeof imageRef === "object" && imageRef.imgid) { + const imgId = imageRef.imgid; + if (processedImages.has(imgId)) continue; // Skip if already processed + + const imageData = images[imgId]; + if (imageData && typeof imageData === "object" && imageData.sizes) { + const sizes = imageData.sizes; + + // Collect all available sizes for this image, grouped by resolution + const imageUrls: { url: string; resolution: string; width: number; height: number }[] = []; + + if (sizes.full && sizes.full.w && sizes.full.h) { + const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.${sizes.full.w}x${sizes.full.h}.jpg`; + if (isValidImageUrl(url)) { + imageUrls.push({ url, resolution: 'full', width: sizes.full.w, height: sizes.full.h }); + } + } + + if (sizes["400"] && sizes["400"].w && sizes["400"].h) { + const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.400x${sizes["400"].h}.jpg`; + if (isValidImageUrl(url)) { + imageUrls.push({ url, resolution: '400px', width: sizes["400"].w, height: sizes["400"].h }); + } + } + + if (sizes["200"] && sizes["200"].w && sizes["200"].h) { + const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.200x${sizes["200"].h}.jpg`; + if (isValidImageUrl(url)) { + imageUrls.push({ url, resolution: '200px', width: sizes["200"].w, height: sizes["200"].h }); + } + } + + if (sizes["100"] && sizes["100"].w && sizes["100"].h) { + const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.100x${sizes["100"].h}.jpg`; + if (isValidImageUrl(url)) { + imageUrls.push({ url, resolution: '100px', width: sizes["100"].w, height: sizes["100"].h }); + } + } + + // Add all valid URLs for this image + for (const img of 
imageUrls) { + urls.push({ + url: img.url, + resolution: img.resolution, + width: img.width, + height: img.height + }); + } + + processedImages.add(imgId); + } + } + } + + // If no language-specific front images found, try any numeric image + if (urls.length === 0) { + for (const [key, imageData] of Object.entries(images)) { + if (/^\d+$/.test(key) && imageData && typeof imageData === "object" && imageData.sizes) { + const sizes = imageData.sizes; + + // Collect all available sizes for this image + if (sizes.full && sizes.full.w && sizes.full.h) { + const url = `https://static.openfoodfacts.org/images/products/${key}/front.${sizes.full.w}x${sizes.full.h}.jpg`; + if (isValidImageUrl(url)) { + urls.push({ + url, + resolution: 'full', + width: sizes.full.w, + height: sizes.full.h + }); + } + } + + if (sizes["400"] && sizes["400"].w && sizes["400"].h) { + const url = `https://static.openfoodfacts.org/images/products/${key}/front.400x${sizes["400"].h}.jpg`; + if (isValidImageUrl(url)) { + urls.push({ + url, + resolution: '400px', + width: sizes["400"].w, + height: sizes["400"].h + }); + } + } + + if (urls.length > 0) break; // Stop after finding the first valid image + } + } + } + } catch (error) { + // ignore malformed structures + } + + return urls; +} + +function mapToCacheRow(product: any): CacheRow | null { + const dataSource = "openfoodfacts/v3"; + + let barcode: string | undefined; + const code = product?.code; + if (typeof code === "string" && code.trim()) { + barcode = code.trim(); + } else if (typeof code === "number") { + barcode = String(code); + } else if (typeof product?._id === "string" && product._id.trim()) { + barcode = product._id.trim(); + } + if (!barcode) return null; + + let brand: string | undefined; + if (typeof product?.brand_owner === "string" && product.brand_owner.trim()) { + brand = product.brand_owner.trim(); + } else if (typeof product?.brands === "string" && product.brands.trim()) { + brand = product.brands.split(",")[0]?.trim(); + } + + let name: string | undefined; + if (typeof product?.product_name === "string" && product.product_name.trim()) { + name = product.product_name.trim(); + } else { + for (const [k, v] of Object.entries(product ?? {})) { + if (k.startsWith("product_name_") && typeof v === "string" && (v as string).trim()) { + name = (v as string).trim(); + break; + } + } + } + + let ingredients: Ingredient[] = []; + if (Array.isArray(product?.ingredients) && product.ingredients.length > 0) { + ingredients = product.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); + } + + const images = extractDisplayImageUrls(product?.images); + const off_last_modified_t = typeof product?.last_modified_t === "number" ? product.last_modified_t : undefined; + + + + return { + barcode, + data_source: dataSource, + brand, + name, + ingredients, + images, + off_last_modified_t, + }; +} + +async function* iterLinesFromGzip(url: string, showProgress: boolean = true): AsyncGenerator { + if (showProgress) { + console.log("šŸ“„ Downloading Open Food Facts data..."); + } + const res = await fetch(url); + if (!res.body) throw new Error("No response body from OFF"); + + const contentLength = res.headers.get("content-length"); + const totalBytes = contentLength ? parseInt(contentLength) : 0; + if (showProgress) { + console.log(`šŸ“¦ File size: ${totalBytes > 0 ? 
formatBytes(totalBytes) : "unknown"}`); + } + + const decompressed = res.body.pipeThrough(new DecompressionStream("gzip")); + const textStream = decompressed.pipeThrough(new TextDecoderStream()); + const reader = textStream.getReader(); + let buf = ""; + let lineCount = 0; + let lastProgressTime = Date.now(); + + try { + while (true) { + const { value, done } = await reader.read(); + if (done) break; + buf += value ?? ""; + let idx: number; + while ((idx = buf.indexOf("\n")) !== -1) { + const line = buf.slice(0, idx); + buf = buf.slice(idx + 1); + lineCount++; + + // Show progress every 50k lines or every 10 seconds (only if showProgress is true) + if (showProgress) { + const now = Date.now(); + if (lineCount % 50000 === 0 || now - lastProgressTime > 10000) { + console.log(`šŸ“Š Processed ${lineCount.toLocaleString()} products...`); + lastProgressTime = now; + } + } + + yield line; + } + } + if (buf.length > 0) { + lineCount++; + yield buf; + } + } finally { + reader.releaseLock(); + } + if (showProgress) { + console.log(`āœ… Download complete! Processed ${lineCount.toLocaleString()} products`); + } +} + +async function writeJsonl(rows: AsyncIterable<{ row: CacheRow; stats: any }>, outPath: string): Promise<{ count: number; totalBytes: number; nonEmpty: { brand: number; name: number; ingredients: number; images: number }; validationStats: any }> { + console.log("šŸ’¾ Writing transformed data to local file..."); + const file = await Deno.open(outPath, { create: true, write: true, truncate: true }); + const encoder = new TextEncoder(); + let count = 0; + let totalBytes = 0; + let nonBrand = 0, nonName = 0, nonIng = 0, nonImg = 0; + let lastProgressTime = Date.now(); + let lastStats: any = null; + + try { + for await (const { row, stats } of rows) { + count++; + if (row.brand) nonBrand++; + if (row.name) nonName++; + if (row.ingredients && row.ingredients.length) nonIng++; + if (row.images && row.images.length) nonImg++; + const json = JSON.stringify(row); + totalBytes += encoder.encode(json).byteLength; + await file.write(encoder.encode(json + "\n")); + lastStats = stats; + + // Show progress every 25k rows or every 5 seconds + const now = Date.now(); + if (count % 25000 === 0 || now - lastProgressTime > 5000) { + const validRate = ((lastStats.validProducts / lastStats.totalLines) * 100).toFixed(1); + const invalidRate = (((lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode) / lastStats.totalLines) * 100).toFixed(1); + console.log(`šŸ“ Written ${count.toLocaleString()} products (${validRate}% valid, ${invalidRate}% invalid)`); + lastProgressTime = now; + } + } + } finally { + file.close(); + } + + // Final validation statistics + if (lastStats) { + console.log(`\nšŸ“Š Validation Statistics:`); + console.log(` Total lines processed: ${lastStats.totalLines.toLocaleString()}`); + console.log(` Valid products: ${lastStats.validProducts.toLocaleString()} (${((lastStats.validProducts / lastStats.totalLines) * 100).toFixed(1)}%)`); + console.log(` Invalid products: ${(lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode).toLocaleString()} (${(((lastStats.emptyLines + lastStats.jsonParseErrors + lastStats.noBarcode) / lastStats.totalLines) * 100).toFixed(1)}%)`); + console.log(` - Empty lines: ${lastStats.emptyLines.toLocaleString()}`); + console.log(` - JSON parse errors: ${lastStats.jsonParseErrors.toLocaleString()}`); + console.log(` - No barcode: ${lastStats.noBarcode.toLocaleString()}`); + } + + console.log(`āœ… Local file complete! 
${count.toLocaleString()} products written`); + return { count, totalBytes, nonEmpty: { brand: nonBrand, name: nonName, ingredients: nonIng, images: nonImg }, validationStats: lastStats }; +} + +async function* projectRows(lines: AsyncIterable): AsyncGenerator<{ row: CacheRow; stats: { totalLines: number; emptyLines: number; jsonParseErrors: number; noBarcode: number; validProducts: number } }> { + let totalLines = 0; + let emptyLines = 0; + let jsonParseErrors = 0; + let noBarcode = 0; + let validProducts = 0; + + for await (const line of lines) { + totalLines++; + + const trimmed = line.trim(); + if (!trimmed) { + emptyLines++; + continue; + } + + try { + const product = JSON.parse(trimmed); + const row = mapToCacheRow(product); + if (row && row.barcode) { + validProducts++; + + // Progress reporting every 50k products + if (validProducts % 50000 === 0) { + const validRate = ((validProducts / totalLines) * 100).toFixed(1); + const invalidRate = (((emptyLines + jsonParseErrors + noBarcode) / totalLines) * 100).toFixed(1); + console.log(`šŸ“Š Processed ${totalLines.toLocaleString()} lines, found ${validProducts.toLocaleString()} valid products (${validRate}% valid, ${invalidRate}% invalid)...`); + } + + yield { row, stats: { totalLines, emptyLines, jsonParseErrors, noBarcode, validProducts } }; + } else { + noBarcode++; + } + } catch (_) { + jsonParseErrors++; + } + } +} + +function formatBytes(bytes: number): string { + const units = ["B", "KB", "MB", "GB", "TB"] as const; + let i = 0; + let n = bytes; + while (n >= 1024 && i < units.length - 1) { + n /= 1024; + i++; + } + return `${n.toFixed(2)} ${units[i]}`; +} + +function estimateDatabaseSize(sampleRows: CacheRow[], totalProducts: number): { + avgRowSize: number; + estimatedTableSize: number; + estimatedIndexSize: number; + estimatedTotalSize: number; +} { + if (sampleRows.length === 0) { + return { avgRowSize: 0, estimatedTableSize: 0, estimatedIndexSize: 0, estimatedTotalSize: 0 }; + } + + // Calculate average row size from sample + const sampleSizes = sampleRows.map(row => { + const json = JSON.stringify(row); + return new TextEncoder().encode(json).byteLength; + }); + + const avgRowSize = sampleSizes.reduce((sum, size) => sum + size, 0) / sampleSizes.length; + + // Estimate table size (data only) + const estimatedTableSize = avgRowSize * totalProducts; + + // Estimate index size (barcode PK + other indexes) + // Barcode index: ~20 bytes per row + overhead + const barcodeIndexSize = (20 + 8) * totalProducts; // 20 bytes key + 8 bytes pointer + const otherIndexSize = totalProducts * 16; // Additional indexes + const estimatedIndexSize = barcodeIndexSize + otherIndexSize; + + // PostgreSQL overhead (20-30% for metadata, TOAST, etc.) 
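+  // Rough heuristic, not a measured value: apply the midpoint of that range (25%)
+  // multiplicatively, i.e. estimatedTotalSize ā‰ˆ (table bytes + index bytes) * 1.25.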
+ const overhead = 0.25; + const estimatedTotalSize = (estimatedTableSize + estimatedIndexSize) * (1 + overhead); + + return { + avgRowSize: Math.round(avgRowSize), + estimatedTableSize: Math.round(estimatedTableSize), + estimatedIndexSize: Math.round(estimatedIndexSize), + estimatedTotalSize: Math.round(estimatedTotalSize) + }; +} + +async function sampleProductsForSizeEstimation(lines: AsyncIterable, sampleSize: number, maxLines: number): Promise<{ sampleRows: CacheRow[]; estimatedTotalValidProducts: number; sampleLines: number }> { + console.log(`šŸ“Š Sampling ${sampleSize.toLocaleString()} products from first ${maxLines.toLocaleString()} lines for size estimation...`); + const sampleRows: CacheRow[] = []; + let sampleLines = 0; + let validProductsInSample = 0; + let sampled = 0; + let lastProgressTime = Date.now(); + + for await (const line of lines) { + sampleLines++; + const trimmed = line.trim(); + if (!trimmed) continue; + + try { + const product = JSON.parse(trimmed); + const row = mapToCacheRow(product); + if (row && row.barcode) { + validProductsInSample++; + if (sampled < sampleSize) { + sampleRows.push(row); + sampled++; + } + } + } catch (_) { + // skip invalid lines + } + + // Show progress every 10k lines or every 5 seconds + const now = Date.now(); + if (sampleLines % 10000 === 0 || now - lastProgressTime > 5000) { + console.log(`šŸ“Š Scanned ${sampleLines.toLocaleString()} lines, found ${validProductsInSample.toLocaleString()} valid products...`); + lastProgressTime = now; + } + + // Stop after processing maxLines + if (sampleLines >= maxLines) break; + } + + // Estimate total valid products based on sample ratio + const validRatio = validProductsInSample / sampleLines; + const estimatedTotalValidProducts = Math.round(validRatio * 4046118); // Known total lines from earlier + + console.log(`āœ… Sampled ${sampled.toLocaleString()} products from ${validProductsInSample.toLocaleString()} valid products in ${sampleLines.toLocaleString()} lines`); + console.log(`šŸ“Š Estimated total valid products: ${estimatedTotalValidProducts.toLocaleString()} (${(validRatio * 100).toFixed(1)}% valid rate)`); + + return { sampleRows, estimatedTotalValidProducts, sampleLines }; +} + +async function askUploadPermission(stats: { count: number; totalBytes: number; dbEstimate?: any }): Promise { + console.log("\nSummary:"); + console.log(` Rows: ${stats.count}`); + console.log(` Payload size (JSON only): ${formatBytes(stats.totalBytes)} (~${Math.round(stats.totalBytes / Math.max(1, stats.count))} B/row)`); + + if (stats.dbEstimate) { + console.log("\nšŸ“Š Database Size Estimate:"); + console.log(` Average row size: ${formatBytes(stats.dbEstimate.avgRowSize)}`); + console.log(` Table data size: ${formatBytes(stats.dbEstimate.estimatedTableSize)}`); + console.log(` Index size: ${formatBytes(stats.dbEstimate.estimatedIndexSize)}`); + console.log(` Total estimated size: ${formatBytes(stats.dbEstimate.estimatedTotalSize)}`); + } + + const answer = confirm("Upload to Supabase inventory_cache? This can take a long time. (y/N)"); + return !!answer; +} + + +async function uploadJsonlToSupabase(path: string) { + console.log("šŸ“¤ Starting upload to Supabase..."); + + // Load env for Supabase connection + const url = Deno.env.get("SUPABASE_URL") ?? ""; + const key = Deno.env.get("SUPABASE_SECRET_KEY") ?? Deno.env.get("SUPABASE_SERVICE_ROLE_KEY") ?? 
""; + + if (!url || !key) { + console.error("āŒ SUPABASE_URL and SUPABASE_SECRET_KEY must be set"); + Deno.exit(1); + } + + const { createClient } = await import("npm:@supabase/supabase-js@2.39.3"); + const supabase = createClient(url, key, { auth: { persistSession: false } }); + + console.log("šŸ” Opening file:", path); + const file = await Deno.open(path, { read: true }); + console.log("āœ… File opened successfully"); + + const decoder = new TextDecoder(); + const bufSize = 64 * 1024; + const buf = new Uint8Array(bufSize); + let pending: any[] = []; + let leftover = ""; + let total = 0; + let batchCount = 0; + let uploadedBatches = 0; + let parallelUploads: Promise[] = []; // Track parallel uploads + + // Progress tracking + const startTime = Date.now(); + const progressInterval = setInterval(() => { + const elapsed = (Date.now() - startTime) / 1000; + const rate = total / elapsed; + console.log(`šŸ“Š Progress: ${total} products processed, ${batchCount} batches uploaded (${rate.toFixed(0)} products/sec)`); + }, 10000); // Every 10 seconds + + try { + while (true) { + const read = await file.read(buf); + if (read === null) { + break; + } + + const chunk = decoder.decode(buf.subarray(0, read)); + let data = leftover + chunk; + let idx: number; + let linesInChunk = 0; + while ((idx = data.indexOf("\n")) !== -1) { + linesInChunk++; + const line = data.slice(0, idx); + data = data.slice(idx + 1); + if (!line) continue; + + try { + const row = JSON.parse(line); + row.last_refreshed_at = new Date().toISOString(); + pending.push(row); + total++; + + if (pending.length >= BATCH_UPLOAD_SIZE) { + batchCount++; + const currentBatchNum = batchCount; + + // Deduplicate by barcode (keep last occurrence) to avoid "ON CONFLICT DO UPDATE command cannot affect row a second time" error + const deduped = new Map(); + for (const row of pending) { + deduped.set(row.barcode, row); + } + + // Create batch for upload + const batch = Array.from(deduped.values()).map(row => ({ + ...row, + last_refreshed_at: new Date().toISOString() + })); + + // Log if duplicates were found + const duplicateCount = pending.length - batch.length; + if (duplicateCount > 0) { + console.log(`āš ļø Removed ${duplicateCount} duplicate barcodes from batch ${currentBatchNum}`); + } + + // Clear pending immediately + pending = []; + uploadedBatches++; + + // Wait if we've reached the parallel limit BEFORE creating new promise + if (parallelUploads.length >= PARALLEL_BATCHES) { + console.log(`ā³ Waiting for ${parallelUploads.length} parallel uploads to complete...`); + await Promise.all(parallelUploads); + parallelUploads = []; + console.log(`āœ… Completed batch group at ${batchCount} batches`); + + // Add delay to avoid rate limiting + if (DELAY_BETWEEN_BATCH_GROUPS_MS > 0) { + await new Promise(resolve => setTimeout(resolve, DELAY_BETWEEN_BATCH_GROUPS_MS)); + } + } + + // Now create and add the upload promise + const uploadPromise = (async () => { + try { + const { error } = await supabase + .from('inventory_cache') + .upsert(batch, { onConflict: 'barcode' }); + + if (error) { + console.error(`āŒ Batch ${currentBatchNum} failed:`, error.message); + throw error; + } + + // Only log every 10 batches + if (currentBatchNum % 10 === 0) { + console.log(`āœ… Uploaded batch ${currentBatchNum} (${batch.length} rows)`); + } + } catch (error) { + console.error(`āŒ Upload error at batch ${currentBatchNum}:`, error); + throw error; + } + })(); + + parallelUploads.push(uploadPromise); + + // Check if we need confirmation (every 500k products) + if 
(uploadedBatches >= BATCHES_PER_CONFIRMATION) { + // Wait for any pending uploads before asking + if (parallelUploads.length > 0) { + await Promise.all(parallelUploads); + parallelUploads = []; + } + + console.log(`\nšŸ“Š Checkpoint: ${total.toLocaleString()} products uploaded in ${batchCount} batches`); + console.log(` (Uploaded ${(uploadedBatches * BATCH_UPLOAD_SIZE).toLocaleString()} products since last checkpoint)`); + const continueUpload = confirm(`Continue uploading next 500k products? [y/N]`); + if (!continueUpload) { + console.log("āŒ Upload cancelled by user"); + return; + } + uploadedBatches = 0; + console.log("āœ… Continuing upload...\n"); + } + } + } catch (error) { + console.error(`āŒ JSON parse error on line:`, error.message); + // skip invalid JSON + } + } + leftover = data; + } + + // Handle remaining data + if (leftover.trim()) { + try { + const row = JSON.parse(leftover.trim()); + row.last_refreshed_at = new Date().toISOString(); + pending.push(row); + total++; + } catch (_) { + // skip invalid JSON + } + } + + // Upload final batch if there's pending data + if (pending.length > 0) { + batchCount++; + const currentBatchNum = batchCount; + + // Deduplicate by barcode (keep last occurrence) + const deduped = new Map(); + for (const row of pending) { + deduped.set(row.barcode, row); + } + + const batch = Array.from(deduped.values()).map(row => ({ + ...row, + last_refreshed_at: new Date().toISOString() + })); + + const duplicateCount = pending.length - batch.length; + if (duplicateCount > 0) { + console.log(`āš ļø Removed ${duplicateCount} duplicate barcodes from final batch ${currentBatchNum}`); + } + + const uploadPromise = (async () => { + try { + const { error } = await supabase + .from('inventory_cache') + .upsert(batch, { onConflict: 'barcode' }); + + if (error) { + console.error(`āŒ Final batch ${currentBatchNum} failed:`, error.message); + throw error; + } + + console.log(`āœ… Uploaded final batch ${currentBatchNum} (${batch.length} rows)`); + } catch (error) { + console.error(`āŒ Upload error at final batch ${currentBatchNum}:`, error); + throw error; + } + })(); + + parallelUploads.push(uploadPromise); + } + + // Wait for all remaining parallel uploads to complete + if (parallelUploads.length > 0) { + console.log(`ā³ Waiting for final ${parallelUploads.length} uploads to complete...`); + await Promise.all(parallelUploads); + console.log(`āœ… All uploads completed!`); + } + + console.log(`āœ… Upload complete! 
${total} rows processed in ${batchCount} batches`); + + } finally { + file.close(); + clearInterval(progressInterval); + } +} + +async function main() { + // Parse command line arguments + const args = Deno.args; + const skipDownload = args.includes('--upload-only') || args.includes('-u'); + const showHelp = args.includes('--help') || args.includes('-h'); + + if (showHelp) { + console.log(` +Usage: deno run -A --unstable-kv local/openfoodfacts/off_ingest.ts [options] + +Options: + --upload-only, -u Skip download and processing, go straight to upload + --help, -h Show this help message + +Examples: + deno run -A --unstable-kv local/openfoodfacts/off_ingest.ts # Full process + deno run -A --unstable-kv local/openfoodfacts/off_ingest.ts --upload-only # Upload only + deno run -A --unstable-kv local/openfoodfacts/off_ingest.ts -u # Upload only (short) + `); + return; + } + + if (skipDownload) { + console.log("šŸš€ Starting in upload-only mode (skipping download and processing)..."); + } + + // Load environment variables from .env file + await loadEnv(); + + if (!skipDownload) { + console.log("Downloading OFF JSONL.gz and streaming transform..."); + + // First pass: Sample products for size estimation (fast) + const lines = iterLinesFromGzip(OFF_JSONL_GZ_URL, false); + const { sampleRows, estimatedTotalValidProducts } = await sampleProductsForSizeEstimation(lines, SAMPLE_SIZE, SAMPLE_LINES); + + // Estimate database size for the entire dataset + const dbEstimate = estimateDatabaseSize(sampleRows, estimatedTotalValidProducts); + console.log("\nšŸ“Š Database Size Estimate (Full Dataset):"); + console.log(` Estimated total valid products: ${estimatedTotalValidProducts.toLocaleString()}`); + console.log(` Average row size: ${formatBytes(dbEstimate.avgRowSize)}`); + console.log(` Table data size: ${formatBytes(dbEstimate.estimatedTableSize)}`); + console.log(` Index size: ${formatBytes(dbEstimate.estimatedIndexSize)}`); + console.log(` Total estimated size: ${formatBytes(dbEstimate.estimatedTotalSize)}`); + + // Second pass: Process all products + console.log("\nšŸ”„ Processing all products..."); + const rows = projectRows(iterLinesFromGzip(OFF_JSONL_GZ_URL, true)); + const stats = await writeJsonl(rows, OUTPUT_PATH); + + console.log("\nField coverage (non-empty counts):"); + console.log(` brand: ${stats.nonEmpty.brand}`); + console.log(` name: ${stats.nonEmpty.name}`); + console.log(` ingredients: ${stats.nonEmpty.ingredients}`); + console.log(` images: ${stats.nonEmpty.images}`); + + // Show validation summary + if (stats.validationStats) { + console.log("\nšŸ“Š Data Quality Summary:"); + const validRate = ((stats.validationStats.validProducts / stats.validationStats.totalLines) * 100).toFixed(1); + console.log(` Success rate: ${validRate}% (${stats.validationStats.validProducts.toLocaleString()} valid out of ${stats.validationStats.totalLines.toLocaleString()} total)`); + console.log(` Invalid breakdown:`); + console.log(` - Empty lines: ${stats.validationStats.emptyLines.toLocaleString()}`); + console.log(` - JSON parse errors: ${stats.validationStats.jsonParseErrors.toLocaleString()}`); + console.log(` - No barcode: ${stats.validationStats.noBarcode.toLocaleString()}`); + } + + const proceed = await askUploadPermission({ ...stats, dbEstimate }); + if (!proceed) { + console.log("Upload skipped."); + return; + } + } else { + // Check if the local file exists + try { + const stat = await Deno.stat(OUTPUT_PATH); + console.log(`šŸ“ Found existing file: ${OUTPUT_PATH} (${formatBytes(stat.size)})`); + 
} catch { + console.error(`āŒ No existing ${OUTPUT_PATH} file found. Run without --upload-only first.`); + return; + } + } + console.log("\nUploading to Supabase (batched upserts)..."); + const start = Date.now(); + await uploadJsonlToSupabase(OUTPUT_PATH); + const elapsed = (Date.now() - start) / 1000; + console.log(`Done in ${elapsed.toFixed(1)}s.`); +} + +if (import.meta.main) { + await main().catch((err) => { + console.error("Error:", err); + Deno.exit(1); + }); +} + + diff --git a/local/openfoodfacts/off_upload_batch.ts b/local/openfoodfacts/off_upload_batch.ts new file mode 100644 index 0000000..7d82820 --- /dev/null +++ b/local/openfoodfacts/off_upload_batch.ts @@ -0,0 +1,56 @@ +// Child process for uploading a single batch to Supabase +import { createClient } from "npm:@supabase/supabase-js@2.39.3"; + +async function uploadBatch() { + const batchFile = Deno.args[0]; + const batchNumber = Deno.args[1]; + + if (!batchFile || !batchNumber) { + console.error("Usage: deno run off_upload_batch.ts "); + Deno.exit(1); + } + + const url = Deno.env.get("SUPABASE_URL") ?? ""; + const key = Deno.env.get("SUPABASE_SECRET_KEY") ?? Deno.env.get("SUPABASE_SERVICE_ROLE_KEY") ?? ""; + + if (!url || !key) { + console.error("SUPABASE_URL and SUPABASE_SECRET_KEY must be set"); + Deno.exit(1); + } + + const supabase = createClient(url, key, { auth: { persistSession: false } }); + + try { + // Read batch file + const batchData = await Deno.readTextFile(batchFile); + const rows = batchData.trim().split('\n').map(line => JSON.parse(line)); + + // Upload to Supabase + const { error } = await supabase + .from('inventory_cache') + .upsert(rows, { onConflict: 'barcode' }); + + if (error) { + console.error(`āŒ Batch ${batchNumber} failed:`, error.message); + Deno.exit(1); + } + + // Only log every 10th batch to reduce output + if (parseInt(batchNumber) % 10 === 0) { + console.log(`āœ… Uploaded batch ${batchNumber} (${rows.length} rows)`); + } + + } catch (error) { + console.error(`āŒ Batch ${batchNumber} error:`, error.message); + Deno.exit(1); + } finally { + // Clean up temp file + try { + await Deno.remove(batchFile); + } catch { + // Ignore cleanup errors + } + } +} + +uploadBatch(); From 51817125a6e0bbe8394760b9a80a698b86ca28a7 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Wed, 1 Oct 2025 15:52:12 +0530 Subject: [PATCH 15/21] Remove log_inventory table and replace with inventory_cache - Removed log_inventory table definition from tables.sql - Updated get_check_history and get_list_items functions to use inventory_cache - Removed background/log_inventory endpoint - Created consolidated getProductFromCache() function as single source of truth - Updated all code to query inventory_cache instead of log_inventory - Removed redundant lookupProduct wrapper function - inventory.ts now checks cache first, falls back to fresh API fetch --- supabase/database/tables.sql | 81 ++++-------- supabase/functions/background/index.ts | 19 --- supabase/functions/ingredicheck/analyzer.ts | 61 +++------ supabase/functions/ingredicheck/inventory.ts | 127 +++++++++++++++---- 4 files changed, 142 insertions(+), 146 deletions(-) diff --git a/supabase/database/tables.sql b/supabase/database/tables.sql index 14f91a5..7e9c6e1 100644 --- a/supabase/database/tables.sql +++ b/supabase/database/tables.sql @@ -104,33 +104,6 @@ CREATE POLICY user_update_own_log_infer ON public.log_feedback -------------------------------------------------------------------------------- -create table - public.log_inventory ( - created_at timestamp 
with time zone not null default now(), - start_time timestamp with time zone, - end_time timestamp with time zone, - user_id uuid not null, - client_activity_id uuid, - barcode text not null, - data_source text not null, - name text, - brand text, - ingredients json, - images json - ) tablespace pg_default; - -alter table public.log_inventory enable row level security; - -create policy "Select for all authenticated users" on public.log_inventory - for select - using (true); - -create policy "Insert for authenticated users" on public.log_inventory - for insert - with check (auth.uid() = user_id); - --------------------------------------------------------------------------------- - create table public.inventory_traderjoes ( created_at timestamp with time zone not null default now(), @@ -305,12 +278,12 @@ BEGIN SELECT DISTINCT ON (barcode, name, brand) la.created_at, la.client_activity_id, - COALESCE(li.barcode, le.barcode) AS barcode, - COALESCE(li.name, le.name) AS name, - COALESCE(li.brand, le.brand) AS brand, - COALESCE(li.ingredients, le.ingredients) AS ingredients, + COALESCE(le.barcode, ic.barcode) AS barcode, + COALESCE(ic.name, le.name) AS name, + COALESCE(ic.brand, le.brand) AS brand, + COALESCE(ic.ingredients::json, le.ingredients) AS ingredients, COALESCE( - li.images, + ic.images::json, (SELECT json_agg(json_build_object('imageFileHash', text_val)) FROM unnest(le.images) AS dt(text_val)) ) AS images, la.response_body AS ingredient_recommendations, @@ -324,31 +297,27 @@ BEGIN ) AS favorited FROM public.log_analyzebarcode la - LEFT JOIN public.log_inventory li - ON la.client_activity_id = li.client_activity_id LEFT JOIN public.log_extract le ON la.client_activity_id = le.client_activity_id + LEFT JOIN public.inventory_cache ic + ON le.barcode = ic.barcode LEFT JOIN public.log_feedback lf ON la.client_activity_id = lf.client_activity_id WHERE la.created_at > '2024-03-15'::date AND - ( - li.client_activity_id IS NOT NULL - OR - le.client_activity_id IS NOT NULL - ) + le.client_activity_id IS NOT NULL AND ( search_query IS NULL OR - to_tsvector('english', COALESCE(li.name, le.name) || ' ' || COALESCE(li.brand, le.brand) || ' ' || COALESCE(li.ingredients::text, le.ingredients::text)) @@ plainto_tsquery('english', search_query) + to_tsvector('english', COALESCE(ic.name, le.name) || ' ' || COALESCE(ic.brand, le.brand) || ' ' || COALESCE(ic.ingredients::text, le.ingredients::text)) @@ plainto_tsquery('english', search_query) OR - COALESCE(li.name, le.name) ILIKE '%' || search_query || '%' + COALESCE(ic.name, le.name) ILIKE '%' || search_query || '%' OR - COALESCE(li.brand, le.brand) ILIKE '%' || search_query || '%' + COALESCE(ic.brand, le.brand) ILIKE '%' || search_query || '%' OR - COALESCE(li.ingredients::text, le.ingredients::text) ILIKE '%' || search_query || '%' + COALESCE(ic.ingredients::text, le.ingredients::text) ILIKE '%' || search_query || '%' ) ORDER BY barcode, name, brand, la.created_at DESC @@ -380,37 +349,33 @@ BEGIN uli.created_at, uli.list_id, uli.list_item_id, - COALESCE(li.barcode, le.barcode) AS barcode, - COALESCE(li.name, le.name) AS name, - COALESCE(li.brand, le.brand) AS brand, - COALESCE(li.ingredients, le.ingredients::json) AS ingredients, + COALESCE(le.barcode, ic.barcode) AS barcode, + COALESCE(ic.name, le.name) AS name, + COALESCE(ic.brand, le.brand) AS brand, + COALESCE(ic.ingredients::json, le.ingredients::json) AS ingredients, COALESCE( - li.images, + ic.images::json, (SELECT json_agg(json_build_object('imageFileHash', text_val)) FROM unnest(le.images) AS 
dt(text_val)) ) AS images FROM public.user_list_items uli - LEFT JOIN public.log_inventory li ON uli.list_item_id = li.client_activity_id LEFT JOIN public.log_extract le ON uli.list_item_id = le.client_activity_id + LEFT JOIN public.inventory_cache ic ON le.barcode = ic.barcode WHERE uli.list_id = input_list_id AND - ( - li.client_activity_id IS NOT NULL - OR - le.client_activity_id IS NOT NULL - ) + le.client_activity_id IS NOT NULL AND ( search_query IS NULL OR - to_tsvector('english', COALESCE(li.name, le.name) || ' ' || COALESCE(li.brand, le.brand) || ' ' || COALESCE(li.ingredients::text, le.ingredients::text)) @@ plainto_tsquery('english', search_query) + to_tsvector('english', COALESCE(ic.name, le.name) || ' ' || COALESCE(ic.brand, le.brand) || ' ' || COALESCE(ic.ingredients::text, le.ingredients::text)) @@ plainto_tsquery('english', search_query) OR - COALESCE(li.name, le.name) ILIKE '%' || search_query || '%' + COALESCE(ic.name, le.name) ILIKE '%' || search_query || '%' OR - COALESCE(li.brand, le.brand) ILIKE '%' || search_query || '%' + COALESCE(ic.brand, le.brand) ILIKE '%' || search_query || '%' OR - COALESCE(li.ingredients::text, le.ingredients::text) ILIKE '%' || search_query || '%' + COALESCE(ic.ingredients::text, le.ingredients::text) ILIKE '%' || search_query || '%' ) ORDER BY uli.created_at DESC; diff --git a/supabase/functions/background/index.ts b/supabase/functions/background/index.ts index b7219d7..32da2bd 100644 --- a/supabase/functions/background/index.ts +++ b/supabase/functions/background/index.ts @@ -45,25 +45,6 @@ router } ctx.response.status = 201 }) - .post('/background/log_inventory', async (ctx) => { - const body = ctx.request.body({ type: 'json', limit: 0 }) - const body_json = await body.value - const user_id = await KitchenSink.getUserId(ctx) - const entry = { - ...body_json, - user_id: user_id, - } - const result = await ctx.state.supabaseClient - .from('log_inventory') - .insert(entry) - if (result.error) { - console.log('supabaseClient.from(log_inventory).insert() failed: ', result.error) - ctx.response.status = 500 - ctx.response.body = result.error - return - } - ctx.response.status = 201 - }) .post('/background/log_llmcalls', async (ctx) => { const body = ctx.request.body({ type: 'json', limit: 0 }) const body_json = await body.value diff --git a/supabase/functions/ingredicheck/analyzer.ts b/supabase/functions/ingredicheck/analyzer.ts index cddb36e..cfe5113 100644 --- a/supabase/functions/ingredicheck/analyzer.ts +++ b/supabase/functions/ingredicheck/analyzer.ts @@ -72,7 +72,7 @@ export async function streamInventoryAndAnalysis(ctx: Context) { return; } - const inventoryResult = await Inventory.fetchProduct({ + const inventoryResult = await Inventory.getProductFromCache({ supabaseClient: ctx.state.supabaseClient, barcode, clientActivityId, @@ -162,7 +162,23 @@ export async function performAnalysis( ctx.state.clientActivityId = requestBody.clientActivityId; - const product = productOverride ?? await lookupProduct(ctx, requestBody); + let product: DB.Product; + + if (productOverride) { + product = productOverride; + } else { + const result = await Inventory.getProductFromCache({ + supabaseClient: ctx.state.supabaseClient, + barcode: requestBody.barcode, + clientActivityId: ctx.state.clientActivityId, + }); + + if (result.status !== 200 || !result.product) { + throw new Error(result.error ?? 
"Product not found"); + } + + product = result.product; + } const hasValidPreferences = requestBody.userPreferenceText && requestBody.userPreferenceText.trim() !== "" && @@ -214,44 +230,3 @@ export async function logAnalysisResult( console.error("Failed to log analyze barcode event", error); } } - -async function lookupProduct( - ctx: Context, - requestBody: AnalysisRequest, -): Promise { - if (requestBody.barcode !== undefined) { - const result = await ctx.state.supabaseClient - .from("log_inventory") - .select() - .eq("barcode", requestBody.barcode) - .order("created_at", { ascending: false }) - .limit(1) - .single(); - - if (result.error) { - throw result.error; - } - - return result.data as DB.Product; - } - - const result = await ctx.state.supabaseClient - .from("log_extract") - .select() - .eq("client_activity_id", ctx.state.clientActivityId) - .order("created_at", { ascending: false }) - .limit(1) - .single(); - - if (result.error) { - throw result.error; - } - - return { - barcode: result.data.barcode, - brand: result.data.brand, - name: result.data.name, - ingredients: result.data.ingredients ?? [], - images: [], - }; -} diff --git a/supabase/functions/ingredicheck/inventory.ts b/supabase/functions/ingredicheck/inventory.ts index 5c188a5..5d577e6 100644 --- a/supabase/functions/ingredicheck/inventory.ts +++ b/supabase/functions/ingredicheck/inventory.ts @@ -13,21 +13,93 @@ type InventoryFetchResult = { error?: string; }; +type InventoryCacheOptions = { + supabaseClient: any; + barcode?: string; + clientActivityId?: string; +}; + +type InventoryCacheResult = { + status: number; + product: DB.Product | null; + error?: string; +}; + +/** + * Queries the inventory_cache for a product by barcode. + * If no barcode is provided, falls back to log_extract by clientActivityId. + */ +export async function getProductFromCache( + options: InventoryCacheOptions, +): Promise { + const { supabaseClient, barcode, clientActivityId } = options; + + // Query inventory_cache if barcode is provided + if (barcode !== undefined) { + const result = await supabaseClient + .from("inventory_cache") + .select() + .eq("barcode", barcode) + .single(); + + if (result.error) { + return { + status: 404, + product: null, + error: result.error.message ?? "Product not found in cache.", + }; + } + + return { + status: 200, + product: result.data as DB.Product, + }; + } + + // Fallback to log_extract if no barcode provided + if (clientActivityId !== undefined) { + const result = await supabaseClient + .from("log_extract") + .select() + .eq("client_activity_id", clientActivityId) + .order("created_at", { ascending: false }) + .limit(1) + .single(); + + if (result.error) { + return { + status: 404, + product: null, + error: result.error.message ?? "Product not found in extract log.", + }; + } + + return { + status: 200, + product: { + barcode: result.data.barcode, + brand: result.data.brand, + name: result.data.name, + ingredients: result.data.ingredients ?? 
[], + images: [], + }, + }; + } + + return { + status: 400, + product: null, + error: "Either barcode or clientActivityId must be provided.", + }; +} + export async function fetchProduct( options: InventoryFetchOptions, ): Promise { - const { supabaseClient, barcode, clientActivityId } = options; + const { barcode } = options; let product: DB.Product | null = null; let errorMessage: string | undefined; - - const log_json: Record = { - start_time: new Date(), - barcode: barcode, - data_source: "openfoodfacts/v3", - client_activity_id: clientActivityId, - }; - let status = 200; try { @@ -44,7 +116,6 @@ export async function fetchProduct( errorMessage = data.status_verbose || "Product not found."; } else { product = processOpenFoodFactsProductData(barcode, data.product); - Object.assign(log_json, product); } } catch (error) { status = 500; @@ -52,17 +123,6 @@ export async function fetchProduct( console.error(`Failed to fetch product ${barcode}: ${errorMessage}`); } - log_json.end_time = new Date(); - log_json.response_status = status; - if (errorMessage) { - log_json.error = errorMessage; - } - - await supabaseClient.functions.invoke("background/log_inventory", { - body: log_json, - method: "POST", - }); - return { status, product, @@ -75,18 +135,33 @@ export async function get( barcode: string, clientActivityId: string | null, ) { - const result = await fetchProduct({ + // First, try to get product from cache + const cacheResult = await getProductFromCache({ + supabaseClient: ctx.state.supabaseClient, + barcode, + clientActivityId: clientActivityId ?? undefined, + }); + + // If found in cache, return it + if (cacheResult.status === 200 && cacheResult.product) { + ctx.response.status = 200; + ctx.response.body = cacheResult.product; + return; + } + + // If not in cache, fetch from OpenFoodFacts (fetchProduct is still available as fallback) + const fetchResult = await fetchProduct({ supabaseClient: ctx.state.supabaseClient, barcode, clientActivityId, }); - ctx.response.status = result.status; - if (result.status === 200 && result.product) { - ctx.response.body = result.product; + ctx.response.status = fetchResult.status; + if (fetchResult.status === 200 && fetchResult.product) { + ctx.response.body = fetchResult.product; } else { ctx.response.body = { - error: result.error ?? "Unexpected inventory error.", + error: fetchResult.error ?? 
"Unexpected inventory error.", }; } } From 111a945a2f622ca2059f4fff1536ba092386e4d6 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Wed, 1 Oct 2025 16:04:58 +0530 Subject: [PATCH 16/21] Remove OpenFoodFacts API fallback, make inventory endpoint cache-only - Removed fetchProduct() function and OpenFoodFacts API integration - Removed processOpenFoodFactsProductData() and helper functions - Simplified get() endpoint to only read from inventory_cache - Reduced file from 252 lines to 104 lines (67% reduction) - Now returns 404 if product not found in cache --- supabase/functions/ingredicheck/inventory.ts | 158 +------------------ 1 file changed, 5 insertions(+), 153 deletions(-) diff --git a/supabase/functions/ingredicheck/inventory.ts b/supabase/functions/ingredicheck/inventory.ts index 5d577e6..6330cda 100644 --- a/supabase/functions/ingredicheck/inventory.ts +++ b/supabase/functions/ingredicheck/inventory.ts @@ -1,18 +1,6 @@ import { Context } from "https://deno.land/x/oak@v12.6.0/mod.ts"; import * as DB from "../shared/db.ts"; -type InventoryFetchOptions = { - supabaseClient: any; - barcode: string; - clientActivityId?: string | null; -}; - -type InventoryFetchResult = { - status: number; - product: DB.Product | null; - error?: string; -}; - type InventoryCacheOptions = { supabaseClient: any; barcode?: string; @@ -93,159 +81,23 @@ export async function getProductFromCache( }; } -export async function fetchProduct( - options: InventoryFetchOptions, -): Promise { - const { barcode } = options; - - let product: DB.Product | null = null; - let errorMessage: string | undefined; - let status = 200; - - try { - const url = - `https://world.openfoodfacts.org/api/v3/product/${barcode}.json`; - const response = await fetch(url); - const data = await response.json(); - - if (data.status === "failure") { - console.log( - `Unexpected product details: ${JSON.stringify(data, null, 2)}`, - ); - status = 404; - errorMessage = data.status_verbose || "Product not found."; - } else { - product = processOpenFoodFactsProductData(barcode, data.product); - } - } catch (error) { - status = 500; - errorMessage = (error as Error).message; - console.error(`Failed to fetch product ${barcode}: ${errorMessage}`); - } - - return { - status, - product, - error: errorMessage, - }; -} - export async function get( ctx: Context, barcode: string, clientActivityId: string | null, ) { - // First, try to get product from cache - const cacheResult = await getProductFromCache({ + const result = await getProductFromCache({ supabaseClient: ctx.state.supabaseClient, barcode, clientActivityId: clientActivityId ?? undefined, }); - // If found in cache, return it - if (cacheResult.status === 200 && cacheResult.product) { - ctx.response.status = 200; - ctx.response.body = cacheResult.product; - return; - } - - // If not in cache, fetch from OpenFoodFacts (fetchProduct is still available as fallback) - const fetchResult = await fetchProduct({ - supabaseClient: ctx.state.supabaseClient, - barcode, - clientActivityId, - }); - - ctx.response.status = fetchResult.status; - if (fetchResult.status === 200 && fetchResult.product) { - ctx.response.body = fetchResult.product; + ctx.response.status = result.status; + if (result.status === 200 && result.product) { + ctx.response.body = result.product; } else { ctx.response.body = { - error: fetchResult.error ?? "Unexpected inventory error.", - }; - } -} - -type SelectedImages = { - [key: string]: { - display: { - [key: string]: string; + error: result.error ?? 
"Product not found in cache.", }; - }; -}; - -type ImageUrl = { - url: string; -}; - -function extractDisplayImageUrls(selectedImages?: SelectedImages): ImageUrl[] { - if (selectedImages) { - return Object.values(selectedImages).flatMap((image) => { - if (image.display?.en) { - return [{ - url: image.display.en, - }]; - } - return []; - }); - } - return []; -} - -function processOpenFoodFactsProductData( - barcode: string, - product: any, -): DB.Product { - let brand: string | undefined = undefined; - let name: string | undefined = undefined; - let ingredients: any[] = []; - - if (product.brand_owner) { - brand = product.brand_owner; } - - if (product.product_name) { - name = product.product_name; - } - - if (product.ingredients) { - ingredients = product.ingredients.map((i: any) => { - return { - name: i.text, - vegan: i.vegan, - vegetarian: i.vegetarian, - ingredients: i.ingredients?.map((i2: any) => { - return { - name: i2.text, - vegan: i2.vegan, - vegetarian: i2.vegetarian, - ingredients: i2.ingredients?.map((i3: any) => { - return { - name: i3.text, - vegan: i3.vegan, - vegetarian: i3.vegetarian, - ingredients: [], - }; - }) ?? [], - }; - }) ?? [], - }; - }); - } - - const images = extractDisplayImageUrls(product.selected_images); - - // Workaround for known issues with OpenFoodFacts data - if (barcode === "0096619362776") { - // Label says 'Contains No Animal Rennet', but ingredient list has 'Animal Rennet'. - ingredients = ingredients.filter((i) => i.name !== "Animal Rennet"); - } - - return { - barcode: barcode, - brand: brand, - name: name, - ingredients: ingredients, - images: images, - }; } From d4487cf52df615173cc226d7f420f3f9af01692c Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Wed, 1 Oct 2025 16:35:27 +0530 Subject: [PATCH 17/21] Fix Open Food Facts image URL generation - Add barcodeToPath() function to convert barcodes to proper OFF path format (e.g., 088/491/237/3946) - Fix image URL format: full size uses imgId.jpg instead of imgId.WxH.jpg - Add validateImageUrlExists() function for optional HEAD request validation (currently disabled) - Update extractDisplayImageUrls() to use correct barcode paths in URLs - All image URLs now follow format: /images/products/{barcodePath}/{imgId}.{size}.jpg --- local/openfoodfacts/off_ingest.ts | 57 +++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/local/openfoodfacts/off_ingest.ts b/local/openfoodfacts/off_ingest.ts index 0cd8263..2e9040d 100644 --- a/local/openfoodfacts/off_ingest.ts +++ b/local/openfoodfacts/off_ingest.ts @@ -18,7 +18,7 @@ async function loadEnv() { } } } catch (error) { - console.warn("āš ļø Could not load .env file:", error.message); + console.warn("āš ļø Could not load .env file:", (error as Error).message); console.warn(" Make sure to copy .env.template to .env and fill in your values"); } } @@ -81,13 +81,42 @@ function isValidImageUrl(url: string): boolean { } } -function extractDisplayImageUrls(images: any): Image[] { +async function validateImageUrlExists(url: string): Promise { + try { + const response = await fetch(url, { method: 'HEAD' }); + return response.status === 200; + } catch { + return false; + } +} + +// Convert barcode to Open Food Facts path format +// Example: "3017620422003" -> "301/762/042/2003" +// Example: "1" -> "1" +function barcodeToPath(barcode: string): string { + // Short barcodes (8 digits or fewer) are used as-is + if (barcode.length <= 8) { + return barcode; + } + // Longer barcodes are padded to at least 13 digits and split + 
const code = barcode.padStart(13, '0'); + // Split into segments of 3 digits, except the last part + const segments: string[] = []; + for (let i = 0; i < code.length - 4; i += 3) { + segments.push(code.slice(i, i + 3)); + } + segments.push(code.slice(code.length - 4)); // Last 4 digits + return segments.join('/'); +} + +function extractDisplayImageUrls(images: any, barcode: string): Image[] { if (!images || typeof images !== "object") { return []; } const urls: Image[] = []; const processedImages = new Set(); // Track processed image IDs to avoid duplicates + const barcodePath = barcodeToPath(barcode); try { // Open Food Facts image structure: images contains both numeric keys (1,2,3,4) and language-specific keys (front_en, ingredients_fr, etc.) @@ -99,7 +128,7 @@ function extractDisplayImageUrls(images: any): Image[] { for (const langKey of languageKeys) { const imageRef = images[langKey]; if (imageRef && typeof imageRef === "object" && imageRef.imgid) { - const imgId = imageRef.imgid; + const imgId = String(imageRef.imgid); // Ensure imgid is a string if (processedImages.has(imgId)) continue; // Skip if already processed const imageData = images[imgId]; @@ -110,28 +139,28 @@ function extractDisplayImageUrls(images: any): Image[] { const imageUrls: { url: string; resolution: string; width: number; height: number }[] = []; if (sizes.full && sizes.full.w && sizes.full.h) { - const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.${sizes.full.w}x${sizes.full.h}.jpg`; + const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.jpg`; if (isValidImageUrl(url)) { imageUrls.push({ url, resolution: 'full', width: sizes.full.w, height: sizes.full.h }); } } if (sizes["400"] && sizes["400"].w && sizes["400"].h) { - const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.400x${sizes["400"].h}.jpg`; + const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.400.jpg`; if (isValidImageUrl(url)) { imageUrls.push({ url, resolution: '400px', width: sizes["400"].w, height: sizes["400"].h }); } } if (sizes["200"] && sizes["200"].w && sizes["200"].h) { - const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.200x${sizes["200"].h}.jpg`; + const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.200.jpg`; if (isValidImageUrl(url)) { imageUrls.push({ url, resolution: '200px', width: sizes["200"].w, height: sizes["200"].h }); } } if (sizes["100"] && sizes["100"].w && sizes["100"].h) { - const url = `https://static.openfoodfacts.org/images/products/${imgId}/front.100x${sizes["100"].h}.jpg`; + const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.100.jpg`; if (isValidImageUrl(url)) { imageUrls.push({ url, resolution: '100px', width: sizes["100"].w, height: sizes["100"].h }); } @@ -155,12 +184,12 @@ function extractDisplayImageUrls(images: any): Image[] { // If no language-specific front images found, try any numeric image if (urls.length === 0) { for (const [key, imageData] of Object.entries(images)) { - if (/^\d+$/.test(key) && imageData && typeof imageData === "object" && imageData.sizes) { - const sizes = imageData.sizes; + if (/^\d+$/.test(key) && imageData && typeof imageData === "object" && (imageData as any).sizes) { + const sizes = (imageData as any).sizes; // Collect all available sizes for this image if (sizes.full && sizes.full.w && sizes.full.h) { - const url = 
`https://static.openfoodfacts.org/images/products/${key}/front.${sizes.full.w}x${sizes.full.h}.jpg`; + const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${key}.jpg`; if (isValidImageUrl(url)) { urls.push({ url, @@ -172,7 +201,7 @@ function extractDisplayImageUrls(images: any): Image[] { } if (sizes["400"] && sizes["400"].w && sizes["400"].h) { - const url = `https://static.openfoodfacts.org/images/products/${key}/front.400x${sizes["400"].h}.jpg`; + const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${key}.400.jpg`; if (isValidImageUrl(url)) { urls.push({ url, @@ -187,7 +216,7 @@ function extractDisplayImageUrls(images: any): Image[] { } } } - } catch (error) { + } catch (_error) { // ignore malformed structures } @@ -232,7 +261,7 @@ function mapToCacheRow(product: any): CacheRow | null { ingredients = product.ingredients.filter((x: any) => x && typeof x === "object").map(mapIngredient); } - const images = extractDisplayImageUrls(product?.images); + const images = extractDisplayImageUrls(product?.images, barcode); const off_last_modified_t = typeof product?.last_modified_t === "number" ? product.last_modified_t : undefined; @@ -651,7 +680,7 @@ async function uploadJsonlToSupabase(path: string) { } } } catch (error) { - console.error(`āŒ JSON parse error on line:`, error.message); + console.error(`āŒ JSON parse error on line:`, (error as Error).message); // skip invalid JSON } } From 42866198a50a3e1463905b927e807ad4636e0e79 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Wed, 1 Oct 2025 22:49:40 +0530 Subject: [PATCH 18/21] TEMP: Modify upload to only update images column - Only upsert barcode and images fields during upload - Remove updated_at field from upserts (column may not exist) - Add ignoreDuplicates: false to ensure existing rows are updated - Improve error logging with JSON.stringify for better debugging - This is a temporary change to fix image URLs without reprocessing all data --- local/openfoodfacts/off_ingest.ts | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/local/openfoodfacts/off_ingest.ts b/local/openfoodfacts/off_ingest.ts index 2e9040d..4c1f401 100644 --- a/local/openfoodfacts/off_ingest.ts +++ b/local/openfoodfacts/off_ingest.ts @@ -538,7 +538,7 @@ async function askUploadPermission(stats: { count: number; totalBytes: number; d async function uploadJsonlToSupabase(path: string) { - console.log("šŸ“¤ Starting upload to Supabase..."); + console.log("šŸ“¤ Starting upload to Supabase (TEMPORARY: only updating images column)..."); // Load env for Supabase connection const url = Deno.env.get("SUPABASE_URL") ?? 
""; @@ -639,18 +639,24 @@ async function uploadJsonlToSupabase(path: string) { // Now create and add the upload promise const uploadPromise = (async () => { try { + // TEMPORARY: Only update images column + const imageUpdates = batch.map(row => ({ + barcode: row.barcode, + images: row.images + })); + const { error } = await supabase .from('inventory_cache') - .upsert(batch, { onConflict: 'barcode' }); + .upsert(imageUpdates, { onConflict: 'barcode', ignoreDuplicates: false }); if (error) { - console.error(`āŒ Batch ${currentBatchNum} failed:`, error.message); + console.error(`āŒ Batch ${currentBatchNum} failed:`, JSON.stringify(error)); throw error; } // Only log every 10 batches if (currentBatchNum % 10 === 0) { - console.log(`āœ… Uploaded batch ${currentBatchNum} (${batch.length} rows)`); + console.log(`āœ… Updated images for batch ${currentBatchNum} (${batch.length} rows)`); } } catch (error) { console.error(`āŒ Upload error at batch ${currentBatchNum}:`, error); @@ -722,16 +728,22 @@ async function uploadJsonlToSupabase(path: string) { const uploadPromise = (async () => { try { + // TEMPORARY: Only update images column + const imageUpdates = batch.map(row => ({ + barcode: row.barcode, + images: row.images + })); + const { error } = await supabase .from('inventory_cache') - .upsert(batch, { onConflict: 'barcode' }); + .upsert(imageUpdates, { onConflict: 'barcode', ignoreDuplicates: false }); if (error) { - console.error(`āŒ Final batch ${currentBatchNum} failed:`, error.message); + console.error(`āŒ Final batch ${currentBatchNum} failed:`, JSON.stringify(error)); throw error; } - console.log(`āœ… Uploaded final batch ${currentBatchNum} (${batch.length} rows)`); + console.log(`āœ… Updated images for final batch ${currentBatchNum} (${batch.length} rows)`); } catch (error) { console.error(`āŒ Upload error at final batch ${currentBatchNum}:`, error); throw error; @@ -748,7 +760,7 @@ async function uploadJsonlToSupabase(path: string) { console.log(`āœ… All uploads completed!`); } - console.log(`āœ… Upload complete! ${total} rows processed in ${batchCount} batches`); + console.log(`āœ… Image update complete! 
${total} rows processed in ${batchCount} batches`); } finally { file.close(); @@ -838,7 +850,7 @@ Examples: return; } } - console.log("\nUploading to Supabase (batched upserts)..."); + console.log("\nUploading to Supabase (TEMPORARY: only updating images column)..."); const start = Date.now(); await uploadJsonlToSupabase(OUTPUT_PATH); const elapsed = (Date.now() - start) / 1000; From 2468a517f5c4926909173850f631218f79dcdd10 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Wed, 1 Oct 2025 22:52:17 +0530 Subject: [PATCH 19/21] Handle barcode matching with or without leading zeros - Add barcode_matches() SQL function that pads upward only to avoid false matches - Update inventory query to try multiple barcode format variants (EAN-8, UPC-A, EAN-13, ITF-14) - Apply barcode_matches() to JOINs in get_check_history() and get_list_items() - Prevents 8-digit barcodes from matching unrelated 13-digit barcodes - Fixes case where barcode 884912373946 should match 0884912373946 --- supabase/database/tables.sql | 56 +++++++++++++++++++- supabase/functions/ingredicheck/inventory.ts | 39 +++++++++++++- 2 files changed, 91 insertions(+), 4 deletions(-) diff --git a/supabase/database/tables.sql b/supabase/database/tables.sql index 7e9c6e1..bd404bb 100644 --- a/supabase/database/tables.sql +++ b/supabase/database/tables.sql @@ -40,6 +40,58 @@ create trigger trg_inventory_cache_updated_at before update on public.inventory_cache for each row execute function set_inventory_cache_updated_at(); +-- Function to match barcodes with or without leading zeros +-- Only pads UPWARD to avoid false matches between different barcode types +-- Based on inventory stats: 93% are 13-digit, 5% are 8-digit +-- Examples: "884912373946" (12) matches "0884912373946" (13) āœ“ +-- "12345678" (8) does NOT match "0000012345678" (13) āœ— +create or replace function barcode_matches(barcode1 text, barcode2 text) +returns boolean as $$ +declare + len1 int; + len2 int; + min_len int; +begin + if barcode1 is null or barcode2 is null then + return false; + end if; + + -- Direct match (fastest check) + if barcode1 = barcode2 then + return true; + end if; + + len1 := length(barcode1); + len2 := length(barcode2); + min_len := least(len1, len2); + + -- Only pad to lengths equal to or greater than the shorter barcode + -- This prevents 8-digit codes from matching unrelated 13-digit codes + + -- Try 8 digits (EAN-8) only if shortest is <= 8 + if min_len <= 8 and lpad(barcode1, 8, '0') = lpad(barcode2, 8, '0') then + return true; + end if; + + -- Try 12 digits (UPC-A) only if shortest is <= 12 + if min_len <= 12 and lpad(barcode1, 12, '0') = lpad(barcode2, 12, '0') then + return true; + end if; + + -- Try 13 digits (EAN-13) only if shortest is <= 13 + if min_len <= 13 and lpad(barcode1, 13, '0') = lpad(barcode2, 13, '0') then + return true; + end if; + + -- Try 14 digits (ITF-14) only if shortest is <= 14 + if min_len <= 14 and lpad(barcode1, 14, '0') = lpad(barcode2, 14, '0') then + return true; + end if; + + return false; +end; +$$ language plpgsql immutable; + -------------------------------------------------------------------------------- create table @@ -300,7 +352,7 @@ BEGIN LEFT JOIN public.log_extract le ON la.client_activity_id = le.client_activity_id LEFT JOIN public.inventory_cache ic - ON le.barcode = ic.barcode + ON barcode_matches(le.barcode, ic.barcode) LEFT JOIN public.log_feedback lf ON la.client_activity_id = lf.client_activity_id WHERE @@ -360,7 +412,7 @@ BEGIN FROM public.user_list_items uli LEFT JOIN public.log_extract le ON 
uli.list_item_id = le.client_activity_id - LEFT JOIN public.inventory_cache ic ON le.barcode = ic.barcode + LEFT JOIN public.inventory_cache ic ON barcode_matches(le.barcode, ic.barcode) WHERE uli.list_id = input_list_id AND diff --git a/supabase/functions/ingredicheck/inventory.ts b/supabase/functions/ingredicheck/inventory.ts index 6330cda..34c1043 100644 --- a/supabase/functions/ingredicheck/inventory.ts +++ b/supabase/functions/ingredicheck/inventory.ts @@ -24,11 +24,38 @@ export async function getProductFromCache( // Query inventory_cache if barcode is provided if (barcode !== undefined) { + // Try to match barcodes with or without leading zeros + // Only pad UPWARD to avoid false matches between different barcode types + const variants = [barcode]; // Always include original + const len = barcode.length; + + if (len <= 8) { + // EAN-8 format (5% of inventory) - only pad to 8 + variants.push(barcode.padStart(8, '0')); + } else if (len <= 12) { + // UPC-A format - pad to 12, 13, 14 + variants.push(barcode.padStart(12, '0')); + variants.push(barcode.padStart(13, '0')); // UPC-A → EAN-13 conversion + variants.push(barcode.padStart(14, '0')); + } else if (len === 13) { + // EAN-13 format (93% of inventory) - pad to 13, 14 + variants.push(barcode.padStart(13, '0')); + variants.push(barcode.padStart(14, '0')); + } else { + // 14+ digits - only pad to 14 + variants.push(barcode.padStart(14, '0')); + } + + // Remove duplicates and create OR condition + const uniqueVariants = [...new Set(variants)]; + const orCondition = uniqueVariants.map(v => `barcode.eq.${v}`).join(','); + const result = await supabaseClient .from("inventory_cache") .select() - .eq("barcode", barcode) - .single(); + .or(orCondition) + .limit(1) + .maybeSingle(); if (result.error) { return { @@ -38,6 +65,14 @@ export async function getProductFromCache( }; } + if (!result.data) { + return { + status: 404, + product: null, + error: "Product not found in cache.", + }; + } + return { status: 200, product: result.data as DB.Product, From 3b418e9f7d8de7226c1f32a00fb5c28ffb8a2c26 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Thu, 2 Oct 2025 08:34:45 +0530 Subject: [PATCH 20/21] Revert to full upserts with all columns - Remove temporary image-only update logic - Restore full batch upserts with all product data - Will populate fresh inventory_cache table with: - Fixed image URLs with correct barcode paths - All product metadata (name, brand, ingredients, etc.) - Proper last_refreshed_at timestamps --- local/openfoodfacts/off_ingest.ts | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/local/openfoodfacts/off_ingest.ts b/local/openfoodfacts/off_ingest.ts index 4c1f401..2e9040d 100644 --- a/local/openfoodfacts/off_ingest.ts +++ b/local/openfoodfacts/off_ingest.ts @@ -538,7 +538,7 @@ async function askUploadPermission(stats: { count: number; totalBytes: number; d async function uploadJsonlToSupabase(path: string) { - console.log("šŸ“¤ Starting upload to Supabase (TEMPORARY: only updating images column)..."); + console.log("šŸ“¤ Starting upload to Supabase..."); // Load env for Supabase connection const url = Deno.env.get("SUPABASE_URL") ?? 
""; @@ -639,24 +639,18 @@ async function uploadJsonlToSupabase(path: string) { // Now create and add the upload promise const uploadPromise = (async () => { try { - // TEMPORARY: Only update images column - const imageUpdates = batch.map(row => ({ - barcode: row.barcode, - images: row.images - })); - const { error } = await supabase .from('inventory_cache') - .upsert(imageUpdates, { onConflict: 'barcode', ignoreDuplicates: false }); + .upsert(batch, { onConflict: 'barcode' }); if (error) { - console.error(`āŒ Batch ${currentBatchNum} failed:`, JSON.stringify(error)); + console.error(`āŒ Batch ${currentBatchNum} failed:`, error.message); throw error; } // Only log every 10 batches if (currentBatchNum % 10 === 0) { - console.log(`āœ… Updated images for batch ${currentBatchNum} (${batch.length} rows)`); + console.log(`āœ… Uploaded batch ${currentBatchNum} (${batch.length} rows)`); } } catch (error) { console.error(`āŒ Upload error at batch ${currentBatchNum}:`, error); @@ -728,22 +722,16 @@ async function uploadJsonlToSupabase(path: string) { const uploadPromise = (async () => { try { - // TEMPORARY: Only update images column - const imageUpdates = batch.map(row => ({ - barcode: row.barcode, - images: row.images - })); - const { error } = await supabase .from('inventory_cache') - .upsert(imageUpdates, { onConflict: 'barcode', ignoreDuplicates: false }); + .upsert(batch, { onConflict: 'barcode' }); if (error) { - console.error(`āŒ Final batch ${currentBatchNum} failed:`, JSON.stringify(error)); + console.error(`āŒ Final batch ${currentBatchNum} failed:`, error.message); throw error; } - console.log(`āœ… Updated images for final batch ${currentBatchNum} (${batch.length} rows)`); + console.log(`āœ… Uploaded final batch ${currentBatchNum} (${batch.length} rows)`); } catch (error) { console.error(`āŒ Upload error at final batch ${currentBatchNum}:`, error); throw error; @@ -760,7 +748,7 @@ async function uploadJsonlToSupabase(path: string) { console.log(`āœ… All uploads completed!`); } - console.log(`āœ… Image update complete! ${total} rows processed in ${batchCount} batches`); + console.log(`āœ… Upload complete! ${total} rows processed in ${batchCount} batches`); } finally { file.close(); @@ -850,7 +838,7 @@ Examples: return; } } - console.log("\nUploading to Supabase (TEMPORARY: only updating images column)..."); + console.log("\nUploading to Supabase (batched upserts)..."); const start = Date.now(); await uploadJsonlToSupabase(OUTPUT_PATH); const elapsed = (Date.now() - start) / 1000; From 9ac3c8df366a9ecfed20dcf90623681f48949776 Mon Sep 17 00:00:00 2001 From: justanotheratom Date: Thu, 2 Oct 2025 10:18:56 +0530 Subject: [PATCH 21/21] refactor: store raw image metadata instead of constructed URLs - Change ImageMetadata type to minimal structure (type, language, imgid, sizes[]) - Extract all image types: front, ingredients, nutrition, packaging - Extract images for all languages (en, fr, de, es, it, pt, nl, pl, ru, ja, zh, etc.) 
- Remove URL construction logic (moved to runtime in inventory.ts) - Remove unused helper functions: isValidImageUrl, validateImageUrlExists, barcodeToPath - Add comprehensive documentation about image storage strategy - Store only language keys, drop numeric keys to reduce storage by ~60% Image URLs will be constructed at runtime with smart selection: - Prefer English, fallback to other languages - Prefer 400px (medium), fallback to next available size - Return one image per type (front, ingredients, nutrition, packaging) --- local/openfoodfacts/off_ingest.ts | 205 ++++++++++-------------------- 1 file changed, 65 insertions(+), 140 deletions(-) diff --git a/local/openfoodfacts/off_ingest.ts b/local/openfoodfacts/off_ingest.ts index 2e9040d..cd88ce4 100644 --- a/local/openfoodfacts/off_ingest.ts +++ b/local/openfoodfacts/off_ingest.ts @@ -1,6 +1,15 @@ // deno run -A --unstable-kv local/openfoodfacts/off_ingest.ts // Environment: Copy .env.template to .env and fill in your values // Performance: Use --unstable-kv for better memory management +// +// IMAGE STORAGE STRATEGY: +// This script extracts raw image metadata (language keys only) from Open Food Facts +// and stores it in the database. Image URLs are NOT constructed here. +// At runtime, server code will: +// - Select appropriate images based on type (front, ingredients, nutrition, packaging) +// - Prefer English, fallback to other languages +// - Prefer medium resolution (400px), fallback to next available size +// - Construct URLs using: https://static.openfoodfacts.org/images/products/{barcodePath}/{imgId}.{size}.jpg // Load environment variables from .env file async function loadEnv() { @@ -30,11 +39,11 @@ type Ingredient = { ingredients?: Ingredient[]; }; -type Image = { - url: string; - resolution?: string; - width?: number; - height?: number; +type ImageMetadata = { + type: 'front' | 'ingredients' | 'nutrition' | 'packaging'; + language: string; // e.g., 'en', 'fr', 'de' + imgid: string; // numeric image ID (e.g., '1', '2', '3') + sizes: ('full' | '400' | '200' | '100')[]; // available sizes }; type CacheRow = { @@ -43,7 +52,7 @@ type CacheRow = { brand?: string; name?: string; ingredients: Ingredient[]; - images: Image[]; + images: ImageMetadata[]; off_last_modified_t?: number; }; @@ -69,158 +78,74 @@ function mapIngredient(node: any): Ingredient { return item; } -function isValidImageUrl(url: string): boolean { - try { - const parsedUrl = new URL(url); - return parsedUrl.protocol === 'https:' && - parsedUrl.hostname === 'static.openfoodfacts.org' && - url.includes('/images/products/') && - url.endsWith('.jpg'); - } catch { - return false; - } -} - -async function validateImageUrlExists(url: string): Promise { - try { - const response = await fetch(url, { method: 'HEAD' }); - return response.status === 200; - } catch { - return false; - } -} - -// Convert barcode to Open Food Facts path format -// Example: "3017620422003" -> "301/762/042/2003" -// Example: "1" -> "1" -function barcodeToPath(barcode: string): string { - // Short barcodes (8 digits or fewer) are used as-is - if (barcode.length <= 8) { - return barcode; - } - // Longer barcodes are padded to at least 13 digits and split - const code = barcode.padStart(13, '0'); - // Split into segments of 3 digits, except the last part - const segments: string[] = []; - for (let i = 0; i < code.length - 4; i += 3) { - segments.push(code.slice(i, i + 3)); - } - segments.push(code.slice(code.length - 4)); // Last 4 digits - return segments.join('/'); -} - -function 
extractDisplayImageUrls(images: any, barcode: string): Image[] { +/** + * Extract image metadata from Open Food Facts product data. + * Stores only language keys (front_en, ingredients_fr, etc.) - drops numeric keys. + * URL construction happens at runtime in server code. + * + * RUNTIME URL CONSTRUCTION (for server code): + * + * function barcodeToPath(barcode: string): string { + * if (barcode.length <= 8) return barcode; + * const code = barcode.padStart(13, '0'); + * const segments: string[] = []; + * for (let i = 0; i < code.length - 4; i += 3) { + * segments.push(code.slice(i, i + 3)); + * } + * segments.push(code.slice(code.length - 4)); + * return segments.join('/'); + * } + * + * function constructImageUrl(barcode: string, imgId: string, size?: '400' | '200' | '100'): string { + * const path = barcodeToPath(barcode); + * const sizeStr = size ? `.${size}` : ''; + * return `https://static.openfoodfacts.org/images/products/${path}/${imgId}${sizeStr}.jpg`; + * } + */ +function extractDisplayImageUrls(images: any, _barcode: string): ImageMetadata[] { if (!images || typeof images !== "object") { return []; } - const urls: Image[] = []; - const processedImages = new Set(); // Track processed image IDs to avoid duplicates - const barcodePath = barcodeToPath(barcode); + const metadata: ImageMetadata[] = []; try { - // Open Food Facts image structure: images contains both numeric keys (1,2,3,4) and language-specific keys (front_en, ingredients_fr, etc.) - // Language-specific keys reference numeric images via imgid + // Image types we care about + const imageTypes = ['front', 'ingredients', 'nutrition', 'packaging']; - // First, collect all language-specific front images - const languageKeys = ['front_en', 'front_fr', 'front_de', 'front_es', 'front_it', 'front_pt', 'front_nl', 'front_sv', 'front_da', 'front_no', 'front_fi']; + // Common languages (ordered by priority) + const languages = ['en', 'fr', 'de', 'es', 'it', 'pt', 'nl', 'pl', 'ru', 'ja', 'zh', 'sv', 'da', 'no', 'fi']; - for (const langKey of languageKeys) { - const imageRef = images[langKey]; - if (imageRef && typeof imageRef === "object" && imageRef.imgid) { - const imgId = String(imageRef.imgid); // Ensure imgid is a string - if (processedImages.has(imgId)) continue; // Skip if already processed + // Extract all language-specific keys for each image type + for (const imageType of imageTypes) { + for (const lang of languages) { + const key = `${imageType}_${lang}`; + const imageRef = images[key]; - const imageData = images[imgId]; - if (imageData && typeof imageData === "object" && imageData.sizes) { - const sizes = imageData.sizes; - - // Collect all available sizes for this image, grouped by resolution - const imageUrls: { url: string; resolution: string; width: number; height: number }[] = []; - - if (sizes.full && sizes.full.w && sizes.full.h) { - const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.jpg`; - if (isValidImageUrl(url)) { - imageUrls.push({ url, resolution: 'full', width: sizes.full.w, height: sizes.full.h }); - } - } - - if (sizes["400"] && sizes["400"].w && sizes["400"].h) { - const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.400.jpg`; - if (isValidImageUrl(url)) { - imageUrls.push({ url, resolution: '400px', width: sizes["400"].w, height: sizes["400"].h }); - } - } + if (imageRef && typeof imageRef === "object" && imageRef.imgid && imageRef.sizes) { + // Collect which sizes are available + const availableSizes: ('full' | '400' | '200' | 
'100')[] = []; + if (imageRef.sizes.full) availableSizes.push('full'); + if (imageRef.sizes["400"]) availableSizes.push('400'); + if (imageRef.sizes["200"]) availableSizes.push('200'); + if (imageRef.sizes["100"]) availableSizes.push('100'); - if (sizes["200"] && sizes["200"].w && sizes["200"].h) { - const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.200.jpg`; - if (isValidImageUrl(url)) { - imageUrls.push({ url, resolution: '200px', width: sizes["200"].w, height: sizes["200"].h }); - } - } - - if (sizes["100"] && sizes["100"].w && sizes["100"].h) { - const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${imgId}.100.jpg`; - if (isValidImageUrl(url)) { - imageUrls.push({ url, resolution: '100px', width: sizes["100"].w, height: sizes["100"].h }); - } - } - - // Add all valid URLs for this image - for (const img of imageUrls) { - urls.push({ - url: img.url, - resolution: img.resolution, - width: img.width, - height: img.height + if (availableSizes.length > 0) { + metadata.push({ + type: imageType as 'front' | 'ingredients' | 'nutrition' | 'packaging', + language: lang, + imgid: String(imageRef.imgid), + sizes: availableSizes }); } - - processedImages.add(imgId); - } - } - } - - // If no language-specific front images found, try any numeric image - if (urls.length === 0) { - for (const [key, imageData] of Object.entries(images)) { - if (/^\d+$/.test(key) && imageData && typeof imageData === "object" && (imageData as any).sizes) { - const sizes = (imageData as any).sizes; - - // Collect all available sizes for this image - if (sizes.full && sizes.full.w && sizes.full.h) { - const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${key}.jpg`; - if (isValidImageUrl(url)) { - urls.push({ - url, - resolution: 'full', - width: sizes.full.w, - height: sizes.full.h - }); - } - } - - if (sizes["400"] && sizes["400"].w && sizes["400"].h) { - const url = `https://static.openfoodfacts.org/images/products/${barcodePath}/${key}.400.jpg`; - if (isValidImageUrl(url)) { - urls.push({ - url, - resolution: '400px', - width: sizes["400"].w, - height: sizes["400"].h - }); - } - } - - if (urls.length > 0) break; // Stop after finding the first valid image } } } } catch (_error) { - // ignore malformed structures + // Ignore malformed structures } - return urls; + return metadata; } function mapToCacheRow(product: any): CacheRow | null {
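+// Maps one raw OFF product to a CacheRow; it appears to return null when no usable barcode is present.
+// Brand/name selection and ingredient mapping continue below, with images taken from the metadata extracted above.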