diff --git a/README.md b/README.md index 99bc43f..303229b 100644 --- a/README.md +++ b/README.md @@ -197,6 +197,39 @@ pf: My API is at http://localhost:8080/chat, POST with JSON { "message": "the pr File attachments (API specs, curl commands) are also supported. Results are posted back to the Slack thread as downloadable files. +### Tax Organizer (`crab tax`) + +Tax document organizer and deterministic filing handoff generator for supported 2025 federal and California personal return scenarios. + +```bash +crab tax install # Install the plugin +crab tax ./my-tax-docs # Process a folder of tax documents +crab tax ./my-tax-docs --output ./tax-output # Write outputs to a directory +crab tax ./my-tax-docs --profile ./profile.json # Use a taxpayer profile file +crab tax uninstall # Remove the plugin +``` + +**Supported inputs today:** `W-2`, `1099-INT`, `1099-DIV`, `1098`, `1099-B`, `1099-R`, `5498`, `1099-composite`, `property-tax-bill` + +**Outputs:** +- `taxpayer_profile.json` +- `documents.json` +- `reconciliation.json` +- `issues_to_review.json` +- `federal_return_inputs.json` +- `ca_return_inputs.json` +- `estimate_summary.json` +- `turbotax_handoff.md` + +See [plugins/tax/README.md](plugins/tax/README.md) for plugin-specific details. + +**Modes:** +- Mock extraction via `.mock.json` sidecars for deterministic fixture testing +- Live OpenAI extraction for supported PDFs/images when `OPENAI_API_KEY` is set +- Bounded agent research for unknown or unsupported forms using tool calls plus official-source web search + +**Current scope:** single or married filing jointly (MFJ), full-year California resident, no dependent-related federal credits, no RSU/ESPP handling, deterministic estimation for supported scenarios only + ### Excalidraw Whiteboard (`crab draw`) Collaborative whiteboarding with real-time collab via Excalidraw. 
diff --git a/plugins/tax/.gitignore b/plugins/tax/.gitignore new file mode 100644 index 0000000..b947077 --- /dev/null +++ b/plugins/tax/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +dist/ diff --git a/plugins/tax/README.md b/plugins/tax/README.md new file mode 100644 index 0000000..78415e2 --- /dev/null +++ b/plugins/tax/README.md @@ -0,0 +1,99 @@ +# Crab Tax Plugin + +`crab tax` organizes tax documents, extracts supported fields, computes deterministic 2025 federal and California estimates for supported scenarios, and writes a TurboTax-oriented handoff packet. + +## Installation + +```bash +crab tax install +``` + +## Usage + +```bash +crab tax ./my-tax-docs +crab tax ./my-tax-docs --output ./tax-output +crab tax ./my-tax-docs --profile ./profile.json +crab tax uninstall +``` + +## Outputs + +The plugin writes: + +```text +tax-output/ +├── taxpayer_profile.json +├── documents.json +├── extracted/ +├── reconciliation.json +├── issues_to_review.json +├── federal_return_inputs.json +├── ca_return_inputs.json +├── estimate_summary.json +└── turbotax_handoff.md +``` + +## Supported Inputs + +- `W-2` +- `1099-INT` +- `1099-DIV` +- `1098` +- `1099-B` +- `1099-R` +- `5498` +- `1099-composite` +- `property-tax-bill` + +## Extraction Modes + +### Mock Sidecars + +For deterministic local tests, place a `.mock.json` file beside the document: + +```text +w2-2025.pdf +w2-2025.pdf.mock.json +``` + +### Deterministic Parsers + +The plugin prefers deterministic extraction for supported document layouts such as composite brokerage statements and property tax bills. + +### Live OpenAI Extraction + +If no mock sidecar is present and deterministic parsing does not apply, the plugin attempts live extraction for supported PDFs and images when `OPENAI_API_KEY` is set. The current default model is `gpt-5.4`. 
+ +## Agent Research Loop + +For unknown or unsupported tax forms, the plugin runs a bounded agent research pass: + +- the agent inspects the unknown document inventory +- it uses tool calls to perform official-source research +- it records a handling strategy +- unsupported forms remain blocking unless deterministic handling exists + +This is intentionally different from letting an agent improvise tax math. The deterministic engine still owns reconciliation and final computations. + +## Current Supported Scenario + +The deterministic estimation path currently targets: + +- 2025 tax year +- `single` or `mfj` +- California full-year resident +- no dependent-related federal credits +- no RSU / ESPP / inherited-share handling +- no Schedule C, rental, or K-1 support + +Unsupported scenarios are surfaced as blocking issues rather than silently guessed. + +## Development + +Run fixture-based end-to-end tests: + +```bash +cd plugins/tax +npm test +``` diff --git a/plugins/tax/bin/crab-tax.js b/plugins/tax/bin/crab-tax.js new file mode 100644 index 0000000..21e8382 --- /dev/null +++ b/plugins/tax/bin/crab-tax.js @@ -0,0 +1,2 @@ +#!/usr/bin/env node +import('../dist/cli.js'); diff --git a/plugins/tax/package-lock.json b/plugins/tax/package-lock.json new file mode 100644 index 0000000..4da6802 --- /dev/null +++ b/plugins/tax/package-lock.json @@ -0,0 +1,53 @@ +{ + "name": "@crabcode/tax", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@crabcode/tax", + "version": "0.1.0", + "bin": { + "crab-tax": "bin/crab-tax.js" + }, + "devDependencies": { + "@types/node": "^22.0.0", + "typescript": "^5.7.0" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/@types/node": { + "version": "22.19.15", + "resolved": "https://registry.npmjs.org/@types/node/-/node-22.19.15.tgz", + "integrity": "sha512-F0R/h2+dsy5wJAUe3tAU6oqa2qbWY5TpNfL/RGmo1y38hiyO1w3x2jPtt76wmuaJI4DQnOBu21cNXQ2STIUUWg==", + "dev": true, + "license": "MIT", + 
"dependencies": { + "undici-types": "~6.21.0" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/plugins/tax/package.json b/plugins/tax/package.json new file mode 100644 index 0000000..8a762ea --- /dev/null +++ b/plugins/tax/package.json @@ -0,0 +1,23 @@ +{ + "name": "@crabcode/tax", + "version": "0.1.0", + "description": "Tax document organizer and filing handoff plugin for crabcode", + "type": "module", + "main": "dist/index.js", + "bin": { + "crab-tax": "./bin/crab-tax.js" + }, + "scripts": { + "build": "tsc", + "dev": "tsc --watch", + "test": "node test/e2e.mjs && node test/classify.mjs && node test/deterministic.mjs" + }, + "dependencies": {}, + "devDependencies": { + "@types/node": "^22.0.0", + "typescript": "^5.7.0" + }, + "engines": { + "node": ">=20.0.0" + } +} diff --git a/plugins/tax/src/agent/research-loop.ts b/plugins/tax/src/agent/research-loop.ts new file mode 100644 index 0000000..a8e648c --- /dev/null +++ b/plugins/tax/src/agent/research-loop.ts @@ -0,0 +1,294 @@ +import type { DocumentInventoryItem, ReviewIssue, TaxpayerProfile } from '../types.js'; + +const OPENAI_API_URL = 'https://api.openai.com/v1/responses'; +const DEFAULT_AGENT_MODEL = process.env.CRAB_TAX_AGENT_MODEL || 'gpt-5.4'; + +interface ResearchStrategy { + docId: string; + formTypeGuess: string; + recommendedHandling: string; + 
confidence: 'high' | 'medium' | 'low'; + blocking: boolean; + sourceSummary: string; +} + +interface AgentResult { + summary: string; + strategies: ResearchStrategy[]; +} + +/** + * Runs a bounded tool-calling loop (at most 8 turns) that asks the model to research + * documents whose form type is 'unknown' or whose extraction status is 'unsupported'. + * Returns null when OPENAI_API_KEY is unset or no document needs research. + * Throws when the API answers with a non-OK status. + */ +export async function runResearchAgent(args: { + profile: TaxpayerProfile; + documents: DocumentInventoryItem[]; + existingIssues: ReviewIssue[]; + onProgress?: (message: string) => void; +}): Promise<AgentResult | null> { + const apiKey = process.env.OPENAI_API_KEY; + if (!apiKey) { + args.onProgress?.('Research agent skipped because OPENAI_API_KEY is not set.'); + return null; + } + + const targetDocuments = args.documents.filter( + (document) => + document.detectedFormType === 'unknown' || document.extractionStatus === 'unsupported' + ); + if (targetDocuments.length === 0) { + args.onProgress?.('Research agent skipped because there are no unknown or unsupported documents.'); + return null; + } + + args.onProgress?.(`Research agent starting for ${targetDocuments.length} document(s).`); + + const tools = [ + { + type: 'function', + name: 'research_authority', + description: + 'Research unknown tax documents using official IRS and California FTB sources only.', + parameters: { + type: 'object', + properties: { + query: { + type: 'string', + description: 'A focused research query about the unknown or unsupported tax form.', + }, + }, + required: ['query'], + additionalProperties: false, + }, + strict: true, + }, + { + type: 'function', + name: 'record_strategy', + description: + 'Record a handling strategy for a specific document after researching authoritative sources.', + parameters: { + type: 'object', + properties: { + docId: { type: 'string' }, + formTypeGuess: { type: 'string' }, + recommendedHandling: { type: 'string' }, + confidence: { type: 'string', enum: ['high', 'medium', 'low'] }, + blocking: { type: 'boolean' }, + sourceSummary: { type: 'string' }, + }, + required: [ + 'docId', + 'formTypeGuess', + 'recommendedHandling', + 'confidence', + 'blocking', + 'sourceSummary', + ], + additionalProperties: 
false, + }, + strict: true, + }, + { + type: 'function', + name: 'done', + description: 'Finish the document research pass once all target documents have strategies.', + parameters: { + type: 'object', + properties: { + summary: { + type: 'string', + description: 'Short summary of the research results.', + }, + }, + required: ['summary'], + additionalProperties: false, + }, + strict: true, + }, + ]; + + const strategies: ResearchStrategy[] = []; + let summary = ''; + let input = [ + { + role: 'system', + content: + 'You are a bounded tax document research agent. Your job is to research unknown or unsupported tax documents using official IRS and California FTB sources only, then propose handling strategies. You must not invent tax law. Use research_authority before record_strategy when the form is unknown. Mark blocking=true when deterministic tax computation should not proceed without new code or human review.', + }, + { + role: 'user', + content: buildAgentContext(args.profile, targetDocuments, args.existingIssues), + }, + ] as Array<Record<string, unknown>>; + + for (let turn = 0; turn < 8; turn++) { + args.onProgress?.(`Research agent turn ${turn + 1}: requesting next action.`); + const response = await fetchWithTimeout(OPENAI_API_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: DEFAULT_AGENT_MODEL, + input, + tools, + tool_choice: 'auto', + }), + }, 45000); + + if (!response.ok) { + throw new Error(`Research agent failed: ${response.status} ${await response.text()}`); + } + + const payload = (await response.json()) as { + output?: Array<{ + type?: string; + name?: string; + call_id?: string; + arguments?: string; + content?: Array<{ type?: string; text?: string }>; + }>; + output_text?: string; + }; + + const toolCalls = (payload.output || []).filter((item) => item.type === 'function_call'); + args.onProgress?.( + `Research agent turn ${turn + 1}: received ${toolCalls.length} tool call(s).` + ); + if (toolCalls.length === 0) { + /* Raw HTTP responses carry the text inside message items; output_text is only an SDK convenience. */ + const finalText = extractOutputText(payload); + if 
(finalText) { + summary = finalText; + args.onProgress?.(`Research agent turn ${turn + 1}: received final text response.`); + } + break; + } + + const outputs: Array<Record<string, unknown>> = []; + + for (const toolCall of toolCalls) { + const argsJson = JSON.parse(toolCall.arguments || '{}') as Record<string, unknown>; + if (toolCall.name === 'research_authority') { + const query = String(argsJson.query || ''); + args.onProgress?.(`Research agent: researching authority for query "${query}".`); + const result = await researchAuthority(query); + outputs.push({ + type: 'function_call_output', + call_id: toolCall.call_id, + output: JSON.stringify(result), + }); + } else if (toolCall.name === 'record_strategy') { + args.onProgress?.( + `Research agent: recorded strategy for ${String(argsJson.docId || 'unknown-doc')}.` + ); + strategies.push({ + docId: String(argsJson.docId || ''), + formTypeGuess: String(argsJson.formTypeGuess || ''), + recommendedHandling: String(argsJson.recommendedHandling || ''), + /* Fail safe: a missing or unrecognized confidence is recorded as 'low', never promoted to 'high'. */ + confidence: + argsJson.confidence === 'high' || argsJson.confidence === 'medium' + ? argsJson.confidence + : 'low', + blocking: Boolean(argsJson.blocking), + sourceSummary: String(argsJson.sourceSummary || ''), + }); + outputs.push({ + type: 'function_call_output', + call_id: toolCall.call_id, + output: 'recorded', + }); + } else if (toolCall.name === 'done') { + summary = String(argsJson.summary || ''); + args.onProgress?.('Research agent: marked the research pass as done.'); + outputs.push({ + type: 'function_call_output', + call_id: toolCall.call_id, + output: 'done', + }); + } + } + + input = [...input, ...(payload.output || []), ...outputs]; + if (summary) { + break; + } + } + + return { + summary, + strategies, + }; +} + +/** + * Asks the model, with the web_search tool enabled, to research one query and returns the generated text. + * Returns a fixed notice when OPENAI_API_KEY is unset; throws on a non-OK API response. + */ +async function researchAuthority(query: string): Promise<{ summary: string }> { + const apiKey = process.env.OPENAI_API_KEY; + if (!apiKey) { + return { summary: 'No OpenAI API key available for authority research.' 
}; + } + + const response = await fetchWithTimeout(OPENAI_API_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: DEFAULT_AGENT_MODEL, + tools: [{ type: 'web_search' }], + input: `Research this tax question using official IRS and California FTB sources only. If the document is not California-specific, prefer IRS sources. Summarize the likely form type, what schedule or return area it affects, and whether deterministic code should block until dedicated handling exists.\n\nQuery: ${query}`, + }), + }, 45000); + + if (!response.ok) { + throw new Error(`Authority research failed: ${response.status} ${await response.text()}`); + } + + const payload = (await response.json()) as ResponseTextPayload; + return { + summary: extractOutputText(payload), + }; +} + +/** + * Minimal shape of a Responses API payload needed to recover the generated text. + */ +interface ResponseTextPayload { + output_text?: string; + output?: Array<{ + type?: string; + content?: Array<{ type?: string; text?: string }>; + }>; +} + +/** + * Returns the generated text of a response payload. + * Uses output_text when present; otherwise concatenates the text of every output_text part found in message items. + * Returns '' when the payload contains no text. + */ +function extractOutputText(payload: ResponseTextPayload): string { + if (typeof payload.output_text === 'string' && payload.output_text.length > 0) { + return payload.output_text; + } + let text = ''; + for (const item of payload.output || []) { + if (item.type !== 'message') { + continue; + } + for (const part of item.content || []) { + if (part.type === 'output_text' && typeof part.text === 'string') { + text += part.text; + } + } + } + return text; +} + +/** + * fetch() wrapper that aborts the request when it has not settled within timeoutMs. + * The timer is cleared as soon as fetch settles, so reading the response body is not covered by the timeout. + */ +async function fetchWithTimeout( + url: string, + init: RequestInit, + timeoutMs: number +): Promise<Response> { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), timeoutMs); + try { + return await fetch(url, { + ...init, + signal: controller.signal, + }); + } finally { + clearTimeout(timeout); + } +} + +/** + * Builds the user message for the research agent: tax year, filing status, one line per target document, and one line per existing issue. + */ +function buildAgentContext( + profile: TaxpayerProfile, + documents: DocumentInventoryItem[], + issues: ReviewIssue[] +): string { + const documentSummary = documents + .map( + (document) => + `- ${document.id}: ${document.fileName} (detected=${document.detectedFormType}, extractionStatus=${document.extractionStatus})` + ) + .join('\n'); + const issueSummary = issues + .map((issue) => `- ${issue.code}: ${issue.message}`) + .join('\n'); + + return [ + `Tax year: ${profile.taxYear}`, + `Filing status: ${profile.filingStatus}`, + 'Research these documents and propose handling strategies.', + '', + 'Documents:', + documentSummary || '- none', + '', + 'Existing issues:', + issueSummary || '- none', + '', + 'For each target document, record a strategy.', + ].join('\n'); +} diff 
--git a/plugins/tax/src/app/output.ts b/plugins/tax/src/app/output.ts new file mode 100644 index 0000000..9ff54f4 --- /dev/null +++ b/plugins/tax/src/app/output.ts @@ -0,0 +1,21 @@ +import * as fs from 'node:fs'; +import * as path from 'node:path'; + +export function ensureDir(dirPath: string) { + fs.mkdirSync(dirPath, { recursive: true }); +} + +export function writeJson(filePath: string, data: unknown) { + ensureDir(path.dirname(filePath)); + fs.writeFileSync(filePath, `${JSON.stringify(data, null, 2)}\n`, 'utf-8'); +} + +export function writeText(filePath: string, content: string) { + ensureDir(path.dirname(filePath)); + fs.writeFileSync(filePath, content, 'utf-8'); +} + +export function appendText(filePath: string, content: string) { + ensureDir(path.dirname(filePath)); + fs.appendFileSync(filePath, content, 'utf-8'); +} diff --git a/plugins/tax/src/app/run-pipeline.ts b/plugins/tax/src/app/run-pipeline.ts new file mode 100644 index 0000000..eafa5fd --- /dev/null +++ b/plugins/tax/src/app/run-pipeline.ts @@ -0,0 +1,1196 @@ +import { execFileSync } from 'node:child_process'; +import * as crypto from 'node:crypto'; +import * as path from 'node:path'; +import { appendText, writeJson, writeText, ensureDir } from './output.js'; +import { listInputFiles } from '../ingestion/list-input-files.js'; +import { classifyDocument } from '../ingestion/classify-document.js'; +import { canExtractMockDocumentType, extractMockDocument } from '../extraction/mock.js'; +import { + canExtractDeterministicDocumentType, + extractDeterministicDocument, +} from '../extraction/deterministic.js'; +import { + canExtractLiveDocumentType, + extractLiveDocument, + reviewLiveDocumentTaxYear, +} from '../extraction/live-openai.js'; +import { buildAggregatedPreview } from '../compute/aggregate-preview.js'; +import { computeFederalReturn } from '../compute/federal-return.js'; +import { computeCaliforniaReturn } from '../compute/california-return.js'; +import { runResearchAgent } from 
'../agent/research-loop.js'; +import type { + DocumentType, + DocumentInventoryItem, + EstimateSummary, + ExtractedDocument, + ReconciliationReport, + ReviewIssue, + RunPipelineOptions, + RunPipelineResult, +} from '../types.js'; + +const SCHEMA_VERSION = '0.1.0'; + +export async function runPipeline(options: RunPipelineOptions): Promise { + ensureDir(options.outputDir); + ensureDir(path.join(options.outputDir, 'extracted')); + + const progressLogPath = path.join(options.outputDir, 'progress.log'); + const progressJsonPath = path.join(options.outputDir, 'progress.json'); + const progress = { + schemaVersion: SCHEMA_VERSION, + startedAt: new Date().toISOString(), + updatedAt: new Date().toISOString(), + status: 'running' as 'running' | 'completed' | 'failed', + phase: 'setup', + completedSteps: [] as string[], + currentDocument: null as null | { + id: string; + fileName: string; + detectedFormType: string; + index: number; + total: number; + step: string; + }, + documentCounts: { + total: 0, + extracted: 0, + pending: 0, + }, + issueCounts: { + total: 0, + blocking: 0, + }, + latestNote: 'Run initialized.', + }; + + writeText(progressLogPath, ''); + writeJson(progressJsonPath, progress); + + function updateProgress(args: { + note: string; + phase?: string; + completedStep?: string; + currentDocument?: typeof progress.currentDocument; + status?: typeof progress.status; + }) { + progress.updatedAt = new Date().toISOString(); + progress.phase = args.phase || progress.phase; + progress.status = args.status || progress.status; + progress.currentDocument = + args.currentDocument === undefined ? 
progress.currentDocument : args.currentDocument; + progress.documentCounts.total = documents.length; + progress.documentCounts.extracted = extractedDocuments.length; + progress.documentCounts.pending = documents.filter( + (document) => document.extractionStatus === 'pending' + ).length; + progress.issueCounts.total = issues.length; + progress.issueCounts.blocking = issues.filter((issue) => issue.severity === 'blocking').length; + progress.latestNote = args.note; + if (args.completedStep && !progress.completedSteps.includes(args.completedStep)) { + progress.completedSteps.push(args.completedStep); + } + + appendText(progressLogPath, `[${progress.updatedAt}] ${args.note}\n`); + writeJson(progressJsonPath, progress); + if (options.verbose) { + console.log(`[progress] ${args.note}`); + } + } + + const files = listInputFiles(options.inputDir); + const issues: ReviewIssue[] = []; + const extractedDocuments: ExtractedDocument[] = []; + const documents: DocumentInventoryItem[] = []; + + updateProgress({ + phase: 'inventory', + note: `Discovered ${files.length} candidate file(s) in ${options.inputDir}.`, + }); + + try { + for (const [index, filePath] of files.entries()) { + const classification = classifyDocument(filePath); + const relativePath = path.relative(options.inputDir, filePath); + + updateProgress({ + phase: 'inventory', + currentDocument: { + id: `doc-${index + 1}`, + fileName: path.basename(filePath), + detectedFormType: classification.detectedFormType, + index: index + 1, + total: files.length, + step: 'classified', + }, + note: `Classified ${path.basename(filePath)} as ${classification.detectedFormType}.`, + }); + + if (classification.detectedFormType === 'unknown') { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'blocking', + code: 'UNKNOWN_DOCUMENT_TYPE', + message: `Could not confidently classify ${relativePath}.`, + impactedArea: 'document-inventory', + sourceReferences: [relativePath], + suggestedNextAction: + 'Review the document 
classification before trusting the estimate. Material unknown documents must be resolved.', + }); + } + + documents.push({ + schemaVersion: SCHEMA_VERSION, + id: `doc-${index + 1}`, + filePath: relativePath, + fileName: path.basename(filePath), + detectedFormType: classification.detectedFormType, + issuerOrPayer: null, + taxYear: inferTaxYear(relativePath), + pageCount: null, + extractionStatus: 'pending', + confidence: classification.confidence, + }); + } + + updateProgress({ + phase: 'inventory', + completedStep: 'inventory', + currentDocument: null, + note: `Inventory complete. ${documents.length} document(s) queued for extraction.`, + }); + + for (const [index, document] of documents.entries()) { + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'extracting', + }, + note: `Starting extraction ${index + 1}/${documents.length}: ${document.fileName} (${document.detectedFormType}).`, + }); + + const canUseMock = canExtractMockDocumentType(document.detectedFormType); + const canUseDeterministic = canExtractDeterministicDocumentType(document.detectedFormType); + const canUseLive = canExtractLiveDocumentType(document.detectedFormType); + let extracted = canUseMock + ? 
extractMockDocument({ + inputDir: options.inputDir, + document, + }) + : null; + + if (extracted) { + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'using-mock', + }, + note: `Using mock extraction sidecar for ${document.fileName}.`, + }); + } + + if (!extracted && canUseDeterministic) { + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'deterministic', + }, + note: `Running deterministic text extraction for ${document.fileName}.`, + }); + extracted = extractDeterministicDocument({ + inputDir: options.inputDir, + document, + }); + } + + if (!extracted && canUseLive) { + try { + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'live-openai', + }, + note: `Calling live extraction for ${document.fileName}.`, + }); + extracted = await extractLiveDocument({ + inputDir: options.inputDir, + document, + }); + } catch (error) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'blocking', + code: 'LIVE_EXTRACTION_FAILED', + message: `Live extraction failed for ${document.filePath}: ${(error as Error).message}`, + impactedArea: 'extraction', + sourceReferences: [document.filePath], + suggestedNextAction: 'Fix the document input, provide a mock sidecar, or retry live extraction.', + }); + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'failed', + }, + note: `Live extraction failed for ${document.fileName}: ${(error 
as Error).message}`, + }); + } + } + + if (extracted) { + const taxYearReconciliation = await reconcileExtractedTaxYear({ + inputDir: options.inputDir, + document, + extracted, + targetTaxYear: options.profile.taxYear, + }); + extracted = taxYearReconciliation.extracted; + issues.push(...taxYearReconciliation.issues); + extractedDocuments.push(extracted); + document.extractionStatus = 'extracted'; + document.issuerOrPayer = extracted.payerOrIssuer; + document.taxYear = extracted.taxYear; + document.confidence = extracted.confidence; + writeJson(path.join(options.outputDir, 'extracted', `${document.id}.json`), extracted); + issues.push(...validateExtractedDocument(extracted, options.profile.taxYear)); + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'extracted', + }, + note: `Extracted ${document.fileName} via ${extracted.extractionMethod}.`, + }); + continue; + } + + if (canUseMock || canUseDeterministic || canUseLive) { + document.extractionStatus = 'missing_mock_data'; + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'blocking', + code: 'SUPPORTED_DOCUMENT_NOT_EXTRACTED', + message: `Could not extract supported document ${document.filePath}.`, + impactedArea: 'extraction', + sourceReferences: [document.filePath], + suggestedNextAction: + 'Provide a mock sidecar, retry live extraction, or tighten the extraction logic before trusting this run.', + }); + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'blocked', + }, + note: `No extraction path available for supported form ${document.fileName}.`, + }); + continue; + } + + document.extractionStatus = 'unsupported'; + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 
unsupportedDocumentSeverity(document.detectedFormType), + code: 'DOCUMENT_TYPE_NOT_YET_SUPPORTED', + message: `${document.filePath} is not supported in this slice.`, + impactedArea: 'extraction', + sourceReferences: [document.filePath], + suggestedNextAction: + 'Treat this estimate as incomplete until deterministic support exists for this document type.', + }); + updateProgress({ + phase: 'extraction', + currentDocument: { + id: document.id, + fileName: document.fileName, + detectedFormType: document.detectedFormType, + index: index + 1, + total: documents.length, + step: 'unsupported', + }, + note: `Marked ${document.fileName} as unsupported for deterministic extraction.`, + }); + } + + updateProgress({ + phase: 'extraction', + completedStep: 'extraction', + currentDocument: null, + note: `Extraction complete. ${extractedDocuments.length}/${documents.length} document(s) extracted.`, + }); + + const missingDocuments = inferLikelyMissingDocuments(documents); + for (const missing of missingDocuments) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'warning', + code: 'LIKELY_MISSING_DOCUMENT', + message: `Expected but did not detect ${missing}.`, + impactedArea: 'document-inventory', + sourceReferences: [], + suggestedNextAction: `Add the ${missing} document if it applies to this return.`, + }); + } + + updateProgress({ + phase: 'research', + note: 'Starting unknown/unsupported document research pass.', + }); + let researchResult: Awaited> = null; + try { + researchResult = await runResearchAgent({ + profile: options.profile, + documents, + existingIssues: issues, + onProgress(message) { + updateProgress({ + phase: 'research', + note: message, + }); + }, + }); + } catch (error) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'warning', + code: 'RESEARCH_AGENT_FAILED', + message: `Research agent failed: ${(error as Error).message}`, + impactedArea: 'document-research', + sourceReferences: [], + suggestedNextAction: 'Review unknown 
documents manually or rerun after improving research instrumentation.', + }); + updateProgress({ + phase: 'research', + note: `Research agent failed: ${(error as Error).message}`, + }); + } + if (researchResult) { + for (const strategy of researchResult.strategies) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: strategy.blocking ? 'blocking' : 'warning', + code: 'AGENT_DOCUMENT_STRATEGY', + message: `Agent strategy for ${strategy.docId} (${strategy.formTypeGuess}): ${strategy.recommendedHandling}`, + impactedArea: 'document-research', + sourceReferences: [strategy.docId], + suggestedNextAction: strategy.sourceSummary || 'Review the researched strategy before filing.', + }); + } + updateProgress({ + phase: 'research', + completedStep: 'research', + note: `Research pass complete. Generated ${researchResult.strategies.length} strategy note(s).`, + }); + } else { + updateProgress({ + phase: 'research', + completedStep: 'research', + note: 'Research pass ended without strategy output.', + }); + } + + updateProgress({ + phase: 'preview', + note: 'Building aggregated preview from extracted documents.', + }); + const preview = buildAggregatedPreview({ + schemaVersion: SCHEMA_VERSION, + documents, + extractedDocuments, + }); + issues.push(...preview.issues); + issues.push(...collectComputationCoverageIssues(preview.federalReturnInputs)); + + issues.push(...collectUnsupportedScenarioIssues(options, documents, extractedDocuments)); + + const federalEstimatedPayments = roundMoney( + options.profile.estimatedPayments + .filter((payment) => payment.jurisdiction === 'federal') + .reduce((sum, payment) => sum + payment.amount, 0) + ); + const californiaEstimatedPayments = roundMoney( + options.profile.estimatedPayments + .filter((payment) => payment.jurisdiction === 'california') + .reduce((sum, payment) => sum + payment.amount, 0) + ); + const traditionalIraContributions = roundMoney( + Math.max( + options.profile.iraContributions + .filter((contribution) => 
contribution.accountType === 'traditional_ira') + .reduce((sum, contribution) => sum + contribution.amount, 0), + numericMetaValue(preview.federalReturnInputs.traditional_ira_contribution_preview) + ) + ); + + updateProgress({ + phase: 'compute_federal', + note: 'Computing deterministic federal return estimate.', + }); + const federalComputation = computeFederalReturn({ + filingStatus: options.profile.filingStatus, + wages: numericMetaValue(preview.federalReturnInputs.wages), + taxableInterest: numericMetaValue(preview.federalReturnInputs.taxable_interest), + ordinaryDividends: numericMetaValue(preview.federalReturnInputs.ordinary_dividends), + qualifiedDividends: numericMetaValue(preview.federalReturnInputs.qualified_dividends), + retirementTaxableAmount: numericMetaValue(preview.federalReturnInputs.retirement_taxable_amount), + capitalGainDistributions: numericMetaValue( + preview.federalReturnInputs.capital_gain_distributions + ), + shortTermNetGainLoss: numericMetaValue( + preview.federalReturnInputs.short_term_covered_net_gain_loss_preview + ), + longTermNetGainLoss: numericMetaValue( + preview.federalReturnInputs.long_term_covered_net_gain_loss_preview + ), + mortgageInterest: numericMetaValue(preview.federalReturnInputs.mortgage_interest_preview), + pointsPaid: numericMetaValue(preview.federalReturnInputs.points_paid_preview), + propertyTaxPaid: numericMetaValue(preview.federalReturnInputs.property_tax_preview), + stateIncomeTaxPaid: + numericMetaValue(preview.caReturnInputs.california_withholding) + californiaEstimatedPayments, + foreignTaxPaid: numericMetaValue(preview.federalReturnInputs.foreign_tax_paid), + federalWithholding: numericMetaValue(preview.federalReturnInputs.federal_withholding), + federalEstimatedPayments, + traditionalIraContributions, + section199aDividends: numericMetaValue(preview.federalReturnInputs.section_199a_dividends), + workplaceRetirementCovered: hasPositiveField(extractedDocuments, 'box12_code_d'), + }); + for (const message of 
federalComputation.issues) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'warning', + code: 'FEDERAL_COMPUTATION_WARNING', + message, + impactedArea: 'federal-computation', + sourceReferences: [], + suggestedNextAction: 'Review this assumption before filing.', + }); + } + + updateProgress({ + phase: 'compute_california', + note: 'Computing deterministic California return estimate.', + }); + const californiaComputation = computeCaliforniaReturn({ + filingStatus: options.profile.filingStatus, + federalAgi: federalComputation.agi, + californiaWages: numericMetaValue(preview.caReturnInputs.california_wages), + mortgageInterest: numericMetaValue(preview.federalReturnInputs.mortgage_interest_preview), + pointsPaid: numericMetaValue(preview.federalReturnInputs.points_paid_preview), + propertyTaxPaid: numericMetaValue(preview.federalReturnInputs.property_tax_preview), + californiaWithholding: numericMetaValue(preview.caReturnInputs.california_withholding), + californiaEstimatedPayments, + dependents: options.profile.dependents, + }); + for (const message of californiaComputation.issues) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'warning', + code: 'CALIFORNIA_COMPUTATION_WARNING', + message, + impactedArea: 'california-computation', + sourceReferences: [], + suggestedNextAction: 'Review this assumption before filing.', + }); + } + + updateProgress({ + phase: 'reconciliation', + note: 'Building reconciliation report and handoff artifacts.', + }); + const reconciliation: ReconciliationReport = { + schemaVersion: SCHEMA_VERSION, + taxYear: options.profile.taxYear, + confidence: issues.some((issue) => issue.severity === 'blocking') ? 
'low' : 'medium', + checks: [ + { + name: 'document_inventory_completed', + status: 'pass', + message: `Discovered ${documents.length} supported input file(s).`, + }, + ...preview.reconciliationChecks, + { + name: 'supported_scenario_check', + status: issues.some((issue) => issue.code === 'UNSUPPORTED_SCENARIO') ? 'blocking' : 'pass', + message: issues.some((issue) => issue.code === 'UNSUPPORTED_SCENARIO') + ? 'One or more unsupported scenario flags were detected.' + : 'Current run stays within the supported deterministic scenario.', + }, + ], + missingDocumentsLikelyRequired: missingDocuments, + stageDecisionLog: [ + 'Completed document inventory.', + 'Ran mock, deterministic, and live extraction across the supported document set.', + 'Built deterministic preview aggregates for extracted documents.', + 'Computed deterministic federal and California estimates for the supported scenario.', + 'Material unsupported document types are treated as blocking until deterministic support exists.', + ], + }; + + const extractionBlocking = documents.length === 0; + if (extractionBlocking) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'blocking', + code: 'NO_INPUT_DOCUMENTS', + message: 'No supported tax documents were found in the input directory.', + impactedArea: 'document-inventory', + sourceReferences: [], + suggestedNextAction: 'Add PDFs or images for the relevant tax forms and rerun the command.', + }); + } else if (extractedDocuments.length === 0) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'blocking', + code: 'NO_DOCUMENTS_EXTRACTED', + message: 'No documents were successfully extracted in this run.', + impactedArea: 'pipeline', + sourceReferences: [], + suggestedNextAction: 'Add mock sidecars for supported documents or continue implementing extraction.', + }); + } + + const inputFingerprint = crypto + .createHash('sha256') + .update(JSON.stringify({ + files: documents.map((doc) => doc.filePath), + profile: options.profile, + })) + 
.digest('hex') + .slice(0, 16); + + const estimateSummary: EstimateSummary = { + schemaVersion: SCHEMA_VERSION, + taxYear: options.profile.taxYear, + generatedAt: new Date().toISOString(), + inputFingerprint, + confidence: issues.some((issue) => issue.severity === 'blocking') ? 'low' : 'medium', + federalRefundOrAmountOwed: { + value: federalComputation.refundOrAmountOwed, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + caRefundOrAmountOwed: { + value: californiaComputation.refundOrAmountOwed, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + deductionChoice: { + federal: federalComputation.deductionChoice, + california: californiaComputation.deductionChoice, + }, + optimizations: buildOptimizationNotes(federalComputation, californiaComputation), + blockingIssueCount: issues.filter((issue) => issue.severity === 'blocking').length, + }; + + const federalReturnInputs = { + schemaVersion: SCHEMA_VERSION, + taxYear: options.profile.taxYear, + generatedAt: new Date().toISOString(), + inputFingerprint, + confidence: estimateSummary.confidence, + ...preview.federalReturnInputs, + traditional_ira_deduction: { + value: federalComputation.traditionalIraDeduction, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + section_199a_deduction: { + value: federalComputation.section199aDeduction, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + agi: { + value: federalComputation.agi, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + deduction_choice: federalComputation.deductionChoice, + chosen_deduction: { + value: + federalComputation.deductionChoice === 'itemized' + ? 
federalComputation.itemizedDeduction + : federalComputation.standardDeduction, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + taxable_income: { + value: federalComputation.taxableIncome, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + tax_before_payments: { + value: federalComputation.taxBeforePayments, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + foreign_tax_credit_applied: { + value: federalComputation.foreignTaxCreditApplied, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + federal_refund_or_amount_owed: { + value: federalComputation.refundOrAmountOwed, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + status: 'computed_supported_scenario', + }; + + const caReturnInputs = { + schemaVersion: SCHEMA_VERSION, + taxYear: options.profile.taxYear, + generatedAt: new Date().toISOString(), + inputFingerprint, + confidence: estimateSummary.confidence, + ...preview.caReturnInputs, + california_agi: { + value: californiaComputation.californiaAgi, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + deduction_choice: californiaComputation.deductionChoice, + chosen_deduction: { + value: + californiaComputation.deductionChoice === 'itemized' + ? 
californiaComputation.itemizedDeduction + : californiaComputation.standardDeduction, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + taxable_income: { + value: californiaComputation.taxableIncome, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + tax_after_credits: { + value: californiaComputation.taxAfterCredits, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + california_refund_or_amount_owed: { + value: californiaComputation.refundOrAmountOwed, + derivationType: 'computed', + references: extractedDocuments.map((document) => document.documentId), + }, + status: 'computed_supported_scenario', + }; + + const handoff = renderHandoff({ + inputDir: options.inputDir, + profile: options.profile, + documents, + extractedDocuments, + federalReturnInputs, + caReturnInputs, + issues, + }); + + writeJson(path.join(options.outputDir, 'taxpayer_profile.json'), options.profile); + writeJson(path.join(options.outputDir, 'documents.json'), documents); + writeJson(path.join(options.outputDir, 'reconciliation.json'), reconciliation); + writeJson(path.join(options.outputDir, 'issues_to_review.json'), issues); + writeJson(path.join(options.outputDir, 'federal_return_inputs.json'), federalReturnInputs); + writeJson(path.join(options.outputDir, 'ca_return_inputs.json'), caReturnInputs); + writeJson(path.join(options.outputDir, 'estimate_summary.json'), estimateSummary); + writeText(path.join(options.outputDir, 'turbotax_handoff.md'), handoff); + + updateProgress({ + phase: 'done', + status: 'completed', + completedStep: 'artifacts', + currentDocument: null, + note: `Run complete. Wrote final outputs with ${issues.filter((issue) => issue.severity === 'blocking').length} blocking issue(s).`, + }); + + return { + exitCode: issues.some((issue) => issue.severity === 'blocking') ? 
/**
 * Guess the tax year a document belongs to from its file path.
 *
 * Returns the first standalone four-digit year in the 2000-2099 range. A year
 * is standalone when it is not directly adjacent to an ASCII letter or digit,
 * so separators such as `-`, `_`, `.`, `/` and spaces all delimit it.
 *
 * @param filePath - Path or file name to inspect.
 * @returns The year as a number, or `null` when no standalone year is present.
 */
function inferTaxYear(filePath: string): number | null {
  // `\b` cannot be used here: `_` counts as a word character, so names such as
  // `w2_2025.pdf` or `2025_w2.pdf` would never match. Delimit on alphanumerics
  // explicitly instead; longer digit runs (e.g. `20250115`) are still rejected.
  const found = /(?:^|[^a-z0-9])(20\d{2})(?![a-z0-9])/i.exec(filePath);
  return found ? Number(found[1]) : null;
}

/**
 * List form types that are probably missing from the document inventory.
 *
 * Only one rule exists today: a run with no detected `W-2` reports `W-2`.
 *
 * @param documents - Inventory items; only `detectedFormType` is read.
 * @returns Names of the likely-missing form types (possibly empty).
 */
function inferLikelyMissingDocuments(documents: DocumentInventoryItem[]): string[] {
  const hasW2 = documents.some((item) => item.detectedFormType === 'W-2');
  return hasW2 ? [] : ['W-2'];
}
One or more blocking issues remain, so these totals should not be used for filing as-is.', + '' + ); + } + + if (args.documents.length === 0) { + lines.push('- No supported documents found.'); + } else { + for (const document of args.documents) { + lines.push( + `- ${document.fileName}: ${document.detectedFormType} (confidence: ${document.confidence})` + ); + } + } + + lines.push('', '## W-2 Entries'); + lines.push(`- Wages: ${renderMetaValue(args.federalReturnInputs.wages)}`); + lines.push(`- Federal withholding: ${renderMetaValue(args.federalReturnInputs.federal_withholding)}`); + lines.push(`- California wages: ${renderMetaValue(args.caReturnInputs.california_wages)}`); + lines.push(`- California withholding: ${renderMetaValue(args.caReturnInputs.california_withholding)}`); + + lines.push('', '## Interest And Dividends'); + lines.push(`- Taxable interest: ${renderMetaValue(args.federalReturnInputs.taxable_interest)}`); + lines.push(`- Tax-exempt interest: ${renderMetaValue(args.federalReturnInputs.tax_exempt_interest)}`); + lines.push(`- Foreign tax paid: ${renderMetaValue(args.federalReturnInputs.foreign_tax_paid)}`); + lines.push(`- Ordinary dividends: ${renderMetaValue(args.federalReturnInputs.ordinary_dividends)}`); + lines.push(`- Qualified dividends: ${renderMetaValue(args.federalReturnInputs.qualified_dividends)}`); + lines.push(`- Section 199A dividends: ${renderMetaValue(args.federalReturnInputs.section_199a_dividends)}`); + lines.push( + `- Specified private activity bond interest: ${renderMetaValue(args.federalReturnInputs.specified_private_activity_bond_interest)}` + ); + lines.push( + `- Nondividend distributions: ${renderMetaValue(args.federalReturnInputs.nondividend_distributions)}` + ); + + lines.push('', '## Investments'); + lines.push(`- Short-term covered proceeds: ${renderMetaValue(args.federalReturnInputs.short_term_covered_proceeds)}`); + lines.push(`- Short-term covered basis: 
${renderMetaValue(args.federalReturnInputs.short_term_covered_basis)}`); + lines.push( + `- Short-term wash sale adjustments: ${renderMetaValue(args.federalReturnInputs.short_term_wash_sale_adjustments)}` + ); + lines.push(`- Short-term covered net gain/loss preview: ${renderMetaValue(args.federalReturnInputs.short_term_covered_net_gain_loss_preview)}`); + lines.push(`- Long-term covered proceeds: ${renderMetaValue(args.federalReturnInputs.long_term_covered_proceeds)}`); + lines.push(`- Long-term covered basis: ${renderMetaValue(args.federalReturnInputs.long_term_covered_basis)}`); + lines.push( + `- Long-term wash sale adjustments: ${renderMetaValue(args.federalReturnInputs.long_term_wash_sale_adjustments)}` + ); + lines.push(`- Long-term covered net gain/loss preview: ${renderMetaValue(args.federalReturnInputs.long_term_covered_net_gain_loss_preview)}`); + lines.push(`- Capital gain distributions: ${renderMetaValue(args.federalReturnInputs.capital_gain_distributions)}`); + + lines.push('', '## Retirement And IRA'); + lines.push(`- Taxable retirement distributions: ${renderMetaValue(args.federalReturnInputs.retirement_taxable_amount)}`); + lines.push(`- Traditional IRA contribution preview: ${renderMetaValue(args.federalReturnInputs.traditional_ira_contribution_preview)}`); + lines.push(`- Traditional IRA deduction applied: ${renderMetaValue(args.federalReturnInputs.traditional_ira_deduction)}`); + lines.push(`- Section 199A deduction applied: ${renderMetaValue(args.federalReturnInputs.section_199a_deduction)}`); + + lines.push('', '## Mortgage And Property Tax'); + lines.push(`- Mortgage interest preview: ${renderMetaValue(args.federalReturnInputs.mortgage_interest_preview)}`); + lines.push(`- Property tax preview: ${renderMetaValue(args.federalReturnInputs.property_tax_preview)}`); + lines.push(`- Points paid preview: ${renderMetaValue(args.federalReturnInputs.points_paid_preview)}`); + + lines.push('', '## Estimated Outcome'); + lines.push(`- Federal deduction 
choice: ${String(args.federalReturnInputs.deduction_choice || 'n/a')}`); + lines.push(`- Federal AGI: ${renderMetaValue(args.federalReturnInputs.agi)}`); + lines.push(`- Federal taxable income: ${renderMetaValue(args.federalReturnInputs.taxable_income)}`); + lines.push( + `- Foreign tax credit applied in estimate: ${renderMetaValue(args.federalReturnInputs.foreign_tax_credit_applied)}` + ); + lines.push(`- Estimated federal refund/(owed): ${renderMetaValue(args.federalReturnInputs.federal_refund_or_amount_owed)}`); + lines.push(`- California deduction choice: ${String(args.caReturnInputs.deduction_choice || 'n/a')}`); + lines.push(`- California taxable income: ${renderMetaValue(args.caReturnInputs.taxable_income)}`); + lines.push(`- Estimated California refund/(owed): ${renderMetaValue(args.caReturnInputs.california_refund_or_amount_owed)}`); + + lines.push('', '## Review Issues'); + if (args.issues.length === 0) { + lines.push('- No current issues.'); + } else { + for (const issue of args.issues) { + lines.push(`- [${issue.severity}] ${issue.message} Next: ${issue.suggestedNextAction}`); + } + } + + lines.push('', '## Status', '- Deterministic estimation is implemented for the supported scenario. Live extraction is available for supported forms when mock sidecars are absent and `OPENAI_API_KEY` is set.'); + if (args.issues.some((issue) => issue.code === 'AGENT_DOCUMENT_STRATEGY')) { + lines.push('- Agent research strategies were generated for unknown or unsupported documents.'); + } + if (args.issues.some((issue) => issue.severity === 'blocking')) { + lines.push('- Blocking issues remain. The estimate is incomplete until those issues are resolved.'); + } + return `${lines.join('\n')}\n`; +} + +function renderMetaValue(value: unknown): string { + if (!value || typeof value !== 'object' || !('value' in value)) { + return 'n/a'; + } + const maybeValue = (value as { value: unknown }).value; + return maybeValue === null ? 
/**
 * Read the numeric payload out of a `{ value: ... }` wrapper.
 *
 * @param value - Anything; expected to be an object carrying a `value` key.
 * @returns The wrapped number when it is a finite number, otherwise `0`.
 */
function numericMetaValue(value: unknown): number {
  if (!value || typeof value !== 'object' || !('value' in value)) {
    return 0;
  }
  const raw = (value as { value: unknown }).value;
  // NaN and +/-Infinity are `typeof 'number'` but would poison every total
  // computed from them, so they fall back to 0 like any other non-number.
  return typeof raw === 'number' && Number.isFinite(raw) ? raw : 0;
}

/**
 * Report whether any extracted document carries a strictly positive number in
 * the named field.
 *
 * @param documents - Extracted documents to scan.
 * @param fieldName - Key to look up in each document's `fields` map.
 * @returns `true` when at least one document has `fields[fieldName].value > 0`.
 */
function hasPositiveField(documents: ExtractedDocument[], fieldName: string): boolean {
  for (const doc of documents) {
    const candidate = doc.fields[fieldName]?.value;
    if (typeof candidate === 'number' && candidate > 0) {
      return true;
    }
  }
  return false;
}

/**
 * Build one blocking issue per unsupported scenario found on the profile.
 *
 * Checks, in order: not a full-year resident, any dependents, and any of the
 * RSU / ESPP / inherited-shares scenario flags.
 *
 * @param options - Pipeline options; only `options.profile` is read.
 * @param documents - Currently unused.
 * @param extractedDocuments - Currently unused.
 * @returns The issues produced by `unsupportedIssue`, possibly empty.
 */
function collectUnsupportedScenarioIssues(
  options: RunPipelineOptions,
  documents: DocumentInventoryItem[],
  extractedDocuments: ExtractedDocument[]
): ReviewIssue[] {
  const { profile } = options;
  const { rsu, espp, inheritedShares } = profile.scenarioFlags;
  const found: ReviewIssue[] = [];

  if (!profile.fullYearResident) {
    found.push(
      unsupportedIssue('Full-year California residency is required for the current deterministic flow.')
    );
  }

  if (profile.dependents > 0) {
    found.push(
      unsupportedIssue('Federal dependent-related credits are not implemented in the current deterministic flow.')
    );
  }

  if (rsu || espp || inheritedShares) {
    found.push(
      unsupportedIssue('RSU, ESPP, or inherited-share scenarios are not implemented in the current deterministic flow.')
    );
  }

  return found;
}
instead of the reporting year, then correct the extraction before relying on this run.', + }); + } + + if (document.fields.has_unsupported_brokerage_rows?.value === true) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'blocking', + code: 'UNSUPPORTED_BROKERAGE_BASIS_CATEGORY', + message: `${document.documentType} ${document.documentId} includes noncovered or unknown-term brokerage rows that are not yet mapped into the deterministic return path.`, + impactedArea: 'extraction-validation', + sourceReferences: [document.documentId], + suggestedNextAction: + 'Review the brokerage statement manually or extend the parser to capture the unsupported basis categories before relying on this run.', + }); + } + + if ( + document.documentType === 'property-tax-bill' && + document.fields.property_tax_first_installment_paid?.value !== true + ) { + issues.push({ + schemaVersion: SCHEMA_VERSION, + severity: 'warning', + code: 'PROPERTY_TAX_PAYMENT_NOT_CONFIRMED', + message: `Property tax bill ${document.documentId} did not include clear evidence of payment during the tax year.`, + impactedArea: 'extraction-validation', + sourceReferences: [document.documentId], + suggestedNextAction: + 'Confirm the actual payment date and amount before claiming the property tax deduction.', + }); + } + + return issues; +} + +async function reconcileExtractedTaxYear(args: { + inputDir: string; + document: DocumentInventoryItem; + extracted: ExtractedDocument; + targetTaxYear: number; +}): Promise<{ + extracted: ExtractedDocument; + issues: ReviewIssue[]; +}> { + const sourceTextYear = detectSourceDocumentTaxYear(args.inputDir, args.document); + if ( + sourceTextYear !== null && + sourceTextYear === args.targetTaxYear && + args.extracted.taxYear !== sourceTextYear + ) { + return { + extracted: { + ...args.extracted, + taxYear: sourceTextYear, + }, + issues: [ + { + schemaVersion: SCHEMA_VERSION, + severity: 'warning', + code: 'DOCUMENT_TAX_YEAR_CORRECTED_FROM_SOURCE_TEXT', + message: 
/**
 * Try to read the reporting tax year straight out of a PDF's text layer.
 *
 * Non-PDF inputs are skipped. For PDFs, `pdftotext` is run with `-f 1 -l 2`
 * and the output is tested against a fixed, ordered list of phrasings; the
 * first pattern that matches decides the year.
 *
 * @param inputDir - Directory that `document.filePath` is joined onto.
 * @param document - Inventory item; only `filePath` is read.
 * @returns The detected year, or `null` when the file is not a PDF, when
 *   `pdftotext` fails for any reason, or when no pattern matches.
 */
function detectSourceDocumentTaxYear(
  inputDir: string,
  document: DocumentInventoryItem
): number | null {
  const pdfPath = path.join(inputDir, document.filePath);
  const isPdf = path.extname(pdfPath).toLowerCase() === '.pdf';
  if (!isPdf) {
    return null;
  }

  // Order matters: the first pattern with a match wins.
  // NOTE(review): every match of the `for tax year` pattern is also a match of
  // the `tax year` pattern listed before it, so the third entry looks
  // unreachable as ordered - confirm whether it was meant to take priority.
  const yearPatterns: RegExp[] = [
    /for calendar year\s*(20\d{2})/i,
    /tax year[^0-9]{0,20}(20\d{2})/i,
    /for tax year[^0-9]{0,20}(20\d{2})/i,
    /(?:^|\s)(20\d{2})\s+tax reporting statement/i,
    /(?:^|\s)(20\d{2})\s+1099-(?:div|int|b|misc|oid|r)\b/i,
  ];

  let pageText: string;
  try {
    pageText = execFileSync('pdftotext', ['-f', '1', '-l', '2', pdfPath, '-'], {
      encoding: 'utf-8',
      stdio: ['ignore', 'pipe', 'ignore'],
      maxBuffer: 1024 * 1024,
    });
  } catch {
    // Best effort: a missing binary, unreadable PDF or oversized output simply
    // means no source-text year is available.
    return null;
  }

  for (const candidate of yearPatterns) {
    const hit = pageText.match(candidate);
    if (hit) {
      return Number(hit[1]);
    }
  }
  return null;
}

/**
 * Flag aggregated federal inputs that the deterministic computation does not
 * fully cover.
 *
 * The only rule today: a positive `nondividend_distributions` total yields one
 * warning asking for a basis-tracking review.
 *
 * @param federalReturnInputs - Aggregated federal inputs keyed by field name.
 * @returns Warning issues to append to the review list (possibly empty).
 */
function collectComputationCoverageIssues(
  federalReturnInputs: Record<string, unknown>
): ReviewIssue[] {
  const nondividendTotal = numericMetaValue(federalReturnInputs.nondividend_distributions);
  if (!(nondividendTotal > 0)) {
    return [];
  }

  const warning: ReviewIssue = {
    schemaVersion: SCHEMA_VERSION,
    severity: 'warning',
    code: 'NONDIVIDEND_DISTRIBUTION_REVIEW',
    message:
      'Nondividend distributions were detected. They usually reduce basis rather than current-year taxable income.',
    impactedArea: 'federal-computation',
    sourceReferences: [],
    suggestedNextAction:
      'Enter the nondividend distribution values in filing software and confirm basis tracking for the affected holdings.',
  };
  return [warning];
}
/**
 * Classify a document type as material to the computed return or not.
 *
 * Every listed type except `prior-year-return` is material. The switch has no
 * default arm, so a value outside the listed cases falls through without a
 * return value.
 *
 * @param documentType - Detected document type.
 * @returns `true` for material types, `false` for `prior-year-return`.
 */
function isMaterialDocumentType(documentType: DocumentType): boolean {
  switch (documentType) {
    case 'W-2':
    case '1099-INT':
    case '1099-DIV':
    case '1099-B':
    case '1099-R':
    case '1099-composite':
    case '1098':
    case '5498':
    case 'property-tax-bill':
    case 'unknown':
      return true;
    case 'prior-year-return':
      return false;
  }
}

/**
 * Build a blocking `UNSUPPORTED_SCENARIO` review issue.
 *
 * @param message - Human-readable description of the unsupported scenario.
 * @returns A review issue with fixed severity, code, area and next action.
 */
function unsupportedIssue(message: string): ReviewIssue {
  const issue: ReviewIssue = {
    schemaVersion: SCHEMA_VERSION,
    severity: 'blocking',
    code: 'UNSUPPORTED_SCENARIO',
    message,
    impactedArea: 'scenario-support',
    sourceReferences: [],
    suggestedNextAction:
      'Narrow the scenario or implement the missing tax logic before relying on this result.',
  };
  return issue;
}
/**
 * Round a currency amount to whole cents, halves away from zero.
 *
 * @param value - Amount in dollars; may be negative.
 * @returns The amount rounded to two decimal places.
 */
function roundMoney(value: number): number {
  // Many decimal halves are stored slightly low in binary (1.005 * 100 is
  // 100.49999999999999), so a bare Math.round drops them. Nudging the magnitude
  // by Number.EPSILON restores the intended half. Rounding the magnitude and
  // re-applying the sign keeps gains and losses symmetric.
  const cents = Math.round((Math.abs(value) + Number.EPSILON) * 100);
  return (Math.sign(value) * cents) / 100;
}
/**
 * Build the taxpayer profile for this run.
 *
 * With `--profile <path>` the JSON file is read and returned as-is. Otherwise
 * a California profile is assembled from the individual CLI flags.
 *
 * Exits the process with status 1 when `--filing-status` is not `single` or
 * `mfj`, or when `--tax-year` / `--dependents` is not a whole number.
 *
 * @returns The profile handed to the pipeline.
 */
function loadProfile(): TaxpayerProfile {
  const profilePath = getArg('--profile');
  if (profilePath) {
    // NOTE(review): the file content is cast without validation, so a profile
    // missing fields is only detected wherever it is first read - confirm
    // whether schema validation belongs here.
    const profileJson = fs.readFileSync(path.resolve(profilePath), 'utf-8');
    return JSON.parse(profileJson) as TaxpayerProfile;
  }

  const filingStatus = (getArg('--filing-status') || 'single') as FilingStatus;
  if (filingStatus !== 'single' && filingStatus !== 'mfj') {
    console.error(`Unsupported filing status: ${filingStatus}`);
    process.exit(1);
  }

  const taxYear = parseWholeNumberArg('--tax-year', '2025');
  const dependents = parseWholeNumberArg('--dependents', '0');

  return {
    schemaVersion: '0.1.0',
    taxYear,
    filingStatus,
    state: 'CA',
    fullYearResident: !hasFlag('--not-full-year-resident'),
    dependents,
    estimatedPayments: [],
    iraContributions: [],
    scenarioFlags: {
      rsu: hasFlag('--rsu'),
      espp: hasFlag('--espp'),
      inheritedShares: hasFlag('--inherited-shares'),
    },
    reviewAnswers: {},
  };
}

/**
 * Read a flag that must hold a non-negative whole number.
 *
 * Without this check `parseInt` turns input such as `--tax-year abc` into
 * `NaN`, which would flow into the profile and every comparison made with it.
 *
 * @param flag - Flag name, e.g. `--tax-year`.
 * @param fallback - Text used when the flag is absent or has no value.
 * @returns The parsed integer; exits with status 1 on malformed input.
 */
function parseWholeNumberArg(flag: string, fallback: string): number {
  const raw = getArg(flag) || fallback;
  if (!/^\d+$/.test(raw)) {
    console.error(`Invalid value for ${flag}: ${raw}`);
    process.exit(1);
  }
  return parseInt(raw, 10);
}

/**
 * Find the input directory: the first argument that is neither a flag nor the
 * value consumed by a value-taking flag.
 *
 * Only flags that actually take a value swallow the following argument.
 * Boolean switches such as `--verbose` or `--rsu` do not, so
 * `crab tax --verbose ./docs` resolves to `./docs`.
 *
 * @returns The positional input directory, or `undefined` when none was given.
 */
function getPositionalInputDir(): string | undefined {
  const flagsWithValue = new Set([
    '--output',
    '--profile',
    '--tax-year',
    '--filing-status',
    '--dependents',
  ]);

  for (let index = 0; index < args.length; index += 1) {
    const current = args[index];
    if (!isFlag(current)) {
      return current;
    }
    // Mirror getArg: a value flag consumes the next argument only when that
    // argument is not itself a flag.
    const next = args[index + 1];
    if (flagsWithValue.has(current) && next !== undefined && !isFlag(next)) {
      index += 1;
    }
  }
  return undefined;
}

/**
 * Return the value that follows `flag` on the command line.
 *
 * @param flag - Flag name to look up (first occurrence is used).
 * @returns The following argument, or `undefined` when the flag is absent,
 *   is the last argument, or is followed by another flag.
 */
function getArg(flag: string): string | undefined {
  const position = args.indexOf(flag);
  if (position < 0) {
    return undefined;
  }
  const candidate = args[position + 1];
  return candidate && !isFlag(candidate) ? candidate : undefined;
}
/**
 * Aggregates extracted document fields into preview-level return inputs.
 *
 * Documents are grouped by `documentType`, the numeric fields of each group are
 * summed, and the totals are wrapped with `meta(...)` so every value carries
 * the ids of the documents that can contribute to it. The function also
 * produces reconciliation checks and review issues (missing W-2, qualified
 * dividends exceeding ordinary dividends).
 *
 * @param args.schemaVersion      Copied onto every emitted review issue.
 * @param args.documents          Document inventory (not read by this function).
 * @param args.extractedDocuments Extracted documents to aggregate.
 * @returns Federal/CA preview inputs, an estimate summary with unknown
 *          refund amounts, reconciliation checks, and issues.
 */
export function buildAggregatedPreview(args: {
  schemaVersion: string;
  documents: DocumentInventoryItem[];
  extractedDocuments: ExtractedDocument[];
}): AggregatedPreview {
  const issues: ReviewIssue[] = [];

  // Each call returns a fresh array so emitted metadata objects never share
  // a references array produced here.
  const idsOf = (...groups: ExtractedDocument[][]): string[] =>
    groups.flatMap((group) => group.map((doc) => doc.documentId));
  const ofType = (documentType: string): ExtractedDocument[] =>
    args.extractedDocuments.filter((doc) => doc.documentType === documentType);

  const references = idsOf(args.extractedDocuments);

  const w2Docs = ofType('W-2');
  const intDocs = ofType('1099-INT');
  const divDocs = ofType('1099-DIV');
  const mortgageDocs = ofType('1098');
  const brokerageDocs = ofType('1099-B');
  const compositeDocs = ofType('1099-composite');
  const propertyTaxDocs = ofType('property-tax-bill');
  const retirementDocs = ofType('1099-R');
  const iraContributionDocs = ofType('5498');
  const brokerageLikeDocs = [...brokerageDocs, ...compositeDocs];

  const wages = sumField(w2Docs, 'box1_wages');
  const federalWithholding =
    sumField(w2Docs, 'box2_federal_withholding') +
    sumField(intDocs, 'federal_withholding') +
    sumField(divDocs, 'federal_withholding') +
    sumField(compositeDocs, 'federal_withholding') +
    sumField(retirementDocs, 'federal_withholding');
  const caWages = sumField(w2Docs, 'state_ca_wages');
  const caWithholding =
    sumField(w2Docs, 'state_ca_withholding') + sumField(retirementDocs, 'state_withholding');
  const taxableInterest =
    sumField(intDocs, 'interest_income') + sumField(compositeDocs, 'interest_income');
  const ordinaryDividends =
    sumField(divDocs, 'ordinary_dividends') + sumField(compositeDocs, 'ordinary_dividends');
  const qualifiedDividends =
    sumField(divDocs, 'qualified_dividends') + sumField(compositeDocs, 'qualified_dividends');
  const capitalGainDistributions =
    sumField(divDocs, 'capital_gain_distributions') +
    sumField(compositeDocs, 'capital_gain_distributions');
  const retirementTaxableAmount = sumField(retirementDocs, 'taxable_amount');
  const traditionalIraContributionPreview = sumConditionalField(
    iraContributionDocs,
    'account_type',
    'traditional_ira',
    'contribution_amount'
  );
  const mortgageInterest = sumField(mortgageDocs, 'mortgage_interest_received');
  const propertyTaxPaid =
    sumField(mortgageDocs, 'property_tax_paid') + sumField(propertyTaxDocs, 'property_tax_paid');
  const pointsPaid = sumField(mortgageDocs, 'points_paid');
  const foreignTaxPaid =
    sumField(intDocs, 'foreign_tax_paid') +
    sumField(divDocs, 'foreign_tax_paid') +
    sumField(compositeDocs, 'foreign_tax_paid');
  const taxExemptInterest =
    sumField(intDocs, 'tax_exempt_interest') +
    sumField(divDocs, 'tax_exempt_interest') +
    sumField(compositeDocs, 'tax_exempt_interest');
  const specifiedPrivateActivityBondInterest =
    sumField(intDocs, 'specified_private_activity_bond_interest') +
    sumField(divDocs, 'specified_private_activity_bond_interest') +
    sumField(compositeDocs, 'specified_private_activity_bond_interest');
  const section199aDividends =
    sumField(divDocs, 'section_199a_dividends') + sumField(compositeDocs, 'section_199a_dividends');
  const nondividendDistributions =
    sumField(divDocs, 'nondividend_distributions') +
    sumField(compositeDocs, 'nondividend_distributions');
  const shortTermCoveredProceeds = sumField(brokerageLikeDocs, 'short_term_covered_proceeds');
  const shortTermCoveredBasis = sumField(brokerageLikeDocs, 'short_term_covered_basis');
  const shortTermWashSaleAdjustments = sumField(
    brokerageLikeDocs,
    'short_term_wash_sale_adjustments'
  );
  const longTermCoveredProceeds = sumField(brokerageLikeDocs, 'long_term_covered_proceeds');
  const longTermCoveredBasis = sumField(brokerageLikeDocs, 'long_term_covered_basis');
  const longTermWashSaleAdjustments = sumField(
    brokerageLikeDocs,
    'long_term_wash_sale_adjustments'
  );
  const shortTermNetGainLossPreview = sumNetGainField(
    brokerageLikeDocs,
    'short_term_covered_net_gain_loss',
    'short_term_covered_proceeds',
    'short_term_covered_basis',
    'short_term_wash_sale_adjustments'
  );
  const longTermNetGainLossPreview = sumNetGainField(
    brokerageLikeDocs,
    'long_term_covered_net_gain_loss',
    'long_term_covered_proceeds',
    'long_term_covered_basis',
    'long_term_wash_sale_adjustments'
  );
  const totalIncomePreview = roundMoney(
    wages +
      taxableInterest +
      ordinaryDividends +
      retirementTaxableAmount +
      capitalGainDistributions +
      shortTermNetGainLossPreview +
      longTermNetGainLossPreview
  );

  const hasW2 = w2Docs.length > 0;
  const dividendsReconcile = qualifiedDividends <= ordinaryDividends;
  const caWagesOk = w2Docs.length === 0 || caWages > 0;
  const mortgageOk = mortgageDocs.length === 0 || mortgageInterest > 0;
  const propertyTaxOk = propertyTaxDocs.length === 0 || propertyTaxPaid > 0;

  const reconciliationChecks: AggregatedPreview['reconciliationChecks'] = [
    {
      name: 'w2_present',
      status: hasW2 ? 'pass' : 'blocking',
      message: hasW2 ? 'Found at least one W-2 document.' : 'No W-2 document extracted.',
    },
    {
      name: 'qualified_dividends_not_greater_than_ordinary',
      status: dividendsReconcile ? 'pass' : 'blocking',
      message: dividendsReconcile
        ? 'Qualified dividends reconcile against ordinary dividends.'
        : 'Qualified dividends exceed ordinary dividends.',
    },
    {
      name: 'ca_wages_match_w2_count',
      status: caWagesOk ? 'pass' : 'warning',
      message: caWagesOk
        ? 'California wage fields are present for extracted W-2 data.'
        : 'California wages are missing from extracted W-2 data.',
    },
    {
      name: 'mortgage_interest_supported_if_1098_present',
      status: mortgageOk ? 'pass' : 'warning',
      message: mortgageOk
        ? 'Mortgage interest preview is populated when a 1098 is present.'
        : '1098 extracted but mortgage interest is missing.',
    },
    {
      name: 'property_tax_payment_supported_if_property_tax_bill_present',
      status: propertyTaxOk ? 'pass' : 'warning',
      message: propertyTaxOk
        ? 'Property tax preview is populated when a paid property tax bill is present.'
        : 'Property tax bill extracted but no paid deductible property tax amount was derived.',
    },
  ];

  if (!hasW2) {
    issues.push({
      schemaVersion: args.schemaVersion,
      severity: 'blocking',
      code: 'MISSING_W2_EXTRACTION',
      message: 'A W-2 is required for this return path but was not extracted.',
      impactedArea: 'reconciliation',
      sourceReferences: [],
      suggestedNextAction: 'Add a W-2 document and its mock sidecar or implement live extraction.',
    });
  }

  if (!dividendsReconcile) {
    issues.push({
      schemaVersion: args.schemaVersion,
      severity: 'blocking',
      code: 'DIVIDEND_RECONCILIATION_FAILED',
      message: 'Qualified dividends exceed ordinary dividends.',
      impactedArea: 'reconciliation',
      sourceReferences: idsOf(divDocs, compositeDocs),
      suggestedNextAction: 'Fix the dividend extraction values before continuing.',
    });
  }

  // Confidence is never 'high' here: it is 'medium' only when something was
  // extracted and no blocking issue was raised.
  const hasBlockingIssue = issues.some((issue) => issue.severity === 'blocking');
  const confidence: AggregatedPreview['estimateSummary']['confidence'] =
    !hasBlockingIssue && args.extractedDocuments.length > 0 ? 'medium' : 'low';

  return {
    federalReturnInputs: {
      status: 'preview_aggregated',
      wages: meta(wages, 'normalized', idsOf(w2Docs)),
      taxable_interest: meta(taxableInterest, 'normalized', idsOf(intDocs, compositeDocs)),
      tax_exempt_interest: meta(
        taxExemptInterest,
        'normalized',
        idsOf(intDocs, divDocs, compositeDocs)
      ),
      foreign_tax_paid: meta(foreignTaxPaid, 'normalized', idsOf(intDocs, divDocs, compositeDocs)),
      ordinary_dividends: meta(ordinaryDividends, 'normalized', idsOf(divDocs, compositeDocs)),
      qualified_dividends: meta(qualifiedDividends, 'normalized', idsOf(divDocs, compositeDocs)),
      capital_gain_distributions: meta(
        capitalGainDistributions,
        'normalized',
        idsOf(divDocs, compositeDocs)
      ),
      specified_private_activity_bond_interest: meta(
        specifiedPrivateActivityBondInterest,
        'normalized',
        idsOf(intDocs, divDocs, compositeDocs)
      ),
      section_199a_dividends: meta(
        section199aDividends,
        'normalized',
        idsOf(divDocs, compositeDocs)
      ),
      nondividend_distributions: meta(
        nondividendDistributions,
        'normalized',
        idsOf(divDocs, compositeDocs)
      ),
      retirement_taxable_amount: meta(retirementTaxableAmount, 'normalized', idsOf(retirementDocs)),
      short_term_covered_proceeds: meta(
        shortTermCoveredProceeds,
        'normalized',
        idsOf(brokerageLikeDocs)
      ),
      short_term_covered_basis: meta(shortTermCoveredBasis, 'normalized', idsOf(brokerageLikeDocs)),
      short_term_wash_sale_adjustments: meta(
        shortTermWashSaleAdjustments,
        'normalized',
        idsOf(brokerageLikeDocs)
      ),
      short_term_covered_net_gain_loss_preview: meta(
        shortTermNetGainLossPreview,
        'computed',
        idsOf(brokerageLikeDocs)
      ),
      long_term_covered_proceeds: meta(
        longTermCoveredProceeds,
        'normalized',
        idsOf(brokerageLikeDocs)
      ),
      long_term_covered_basis: meta(longTermCoveredBasis, 'normalized', idsOf(brokerageLikeDocs)),
      long_term_wash_sale_adjustments: meta(
        longTermWashSaleAdjustments,
        'normalized',
        idsOf(brokerageLikeDocs)
      ),
      long_term_covered_net_gain_loss_preview: meta(
        longTermNetGainLossPreview,
        'computed',
        idsOf(brokerageLikeDocs)
      ),
      mortgage_interest_preview: meta(mortgageInterest, 'normalized', idsOf(mortgageDocs)),
      property_tax_preview: meta(
        propertyTaxPaid,
        'normalized',
        idsOf(mortgageDocs, propertyTaxDocs)
      ),
      points_paid_preview: meta(pointsPaid, 'normalized', idsOf(mortgageDocs)),
      // Cite only the document groups that are actually summed into the total.
      federal_withholding: meta(
        federalWithholding,
        'normalized',
        idsOf(w2Docs, intDocs, divDocs, compositeDocs, retirementDocs)
      ),
      total_income_preview: meta(totalIncomePreview, 'computed', references),
      traditional_ira_contribution_preview: meta(
        traditionalIraContributionPreview,
        'normalized',
        idsOf(iraContributionDocs)
      ),
    },
    caReturnInputs: {
      status: 'preview_aggregated',
      california_wages: meta(caWages, 'normalized', idsOf(w2Docs)),
      // caWithholding includes 1099-R state withholding, so those documents
      // must be cited alongside the W-2s.
      california_withholding: meta(caWithholding, 'normalized', idsOf(w2Docs, retirementDocs)),
      federal_starting_income_preview: meta(totalIncomePreview, 'computed', references),
    },
    estimateSummary: {
      confidence,
      federalRefundOrAmountOwed: meta(null, 'computed', []),
      caRefundOrAmountOwed: meta(null, 'computed', []),
      deductionChoice: {
        federal: 'unknown',
        california: 'unknown',
      },
      optimizations: [],
    },
    reconciliationChecks,
    issues,
  };
}
/**
 * Totals `valueField` over the documents whose `conditionField` value is
 * strictly equal to `expectedValue`.
 *
 * Documents where the value field is absent or not a number contribute
 * nothing. The total is passed through `roundMoney` before being returned.
 */
function sumConditionalField(
  documents: ExtractedDocument[],
  conditionField: string,
  expectedValue: string,
  valueField: string
): number {
  const subtotal = documents.reduce((running, { fields }) => {
    const discriminator = fields[conditionField]?.value;
    const amount = fields[valueField]?.value;
    if (discriminator !== expectedValue || typeof amount !== 'number') {
      return running;
    }
    return running + amount;
  }, 0);

  return roundMoney(subtotal);
}
/**
 * Computes a simplified California return from federal AGI and deduction inputs.
 *
 * California AGI is taken to be the federal AGI unchanged (no state
 * adjustments are applied here). The larger of the standard deduction and the
 * unreduced itemized total (mortgage interest + points + property tax) is
 * used, tax is computed from the bracket table, exemption credits are
 * subtracted, and payments are compared against the remaining tax.
 *
 * @param input Filing status, federal AGI, deduction components, payments and
 *              dependent count. `californiaWages` is not read by this function.
 * @returns The computed amounts; `refundOrAmountOwed` is payments minus tax
 *          (positive = refund, negative = amount owed). `issues` lists
 *          limitations of the estimate that affected this result.
 */
export function computeCaliforniaReturn(
  input: CaliforniaComputationInput
): CaliforniaComputationResult {
  const issues: string[] = [];
  const californiaAgi = input.federalAgi;
  const standardDeduction = CALIFORNIA_STANDARD_DEDUCTION_2025[input.filingStatus];
  const itemizedDeduction = roundMoney(
    input.mortgageInterest + input.pointsPaid + input.propertyTaxPaid
  );

  const deductionChoice = itemizedDeduction > standardDeduction ? 'itemized' : 'standard';
  const chosenDeduction = deductionChoice === 'itemized' ? itemizedDeduction : standardDeduction;

  // The high-income limitation only reduces itemized deductions. When the
  // standard deduction already wins against the *unreduced* itemized total it
  // also wins against any reduced amount, so the warning is relevant only when
  // the itemized amount is the one actually used.
  if (
    deductionChoice === 'itemized' &&
    californiaAgi > CALIFORNIA_ITEMIZED_LIMIT_AGI_THRESHOLD_2025[input.filingStatus]
  ) {
    issues.push(
      'California itemized deduction limitation above the 2025 AGI threshold is not implemented; using the unreduced itemized amount.'
    );
  }

  const taxableIncome = roundMoney(Math.max(0, californiaAgi - chosenDeduction));
  const taxBeforeCredits = computeCaliforniaTax(taxableIncome, input.filingStatus);
  const exemptionCredits = computeCaliforniaExemptionCredits({
    filingStatus: input.filingStatus,
    agi: californiaAgi,
    dependents: input.dependents,
  });
  // Exemption credits are applied only up to the tax; the result is floored at zero.
  const taxAfterCredits = roundMoney(Math.max(0, taxBeforeCredits - exemptionCredits));
  const payments = roundMoney(input.californiaWithholding + input.californiaEstimatedPayments);
  const refundOrAmountOwed = roundMoney(payments - taxAfterCredits);

  return {
    californiaAgi,
    deductionChoice,
    standardDeduction,
    itemizedDeduction,
    taxableIncome,
    taxBeforeCredits,
    exemptionCredits,
    taxAfterCredits,
    refundOrAmountOwed,
    issues,
  };
}
/**
 * Applies the California bracket table for `filingStatus` to `taxableIncome`.
 *
 * Brackets are walked in table order; each one taxes the slice of income
 * between the previous bracket's `upTo` and its own `upTo` at its `rate`.
 * Non-positive income yields 0. The result is passed through `roundMoney`.
 */
export function computeCaliforniaTax(taxableIncome: number, filingStatus: FilingStatus): number {
  if (taxableIncome <= 0) {
    return 0;
  }

  let untaxedIncome = taxableIncome;
  let lowerBound = 0;
  let accumulatedTax = 0;

  for (const { upTo, rate } of CALIFORNIA_TAX_BRACKETS_2025[filingStatus]) {
    const slice = Math.min(untaxedIncome, upTo - lowerBound);
    if (slice <= 0) {
      break;
    }

    accumulatedTax += slice * rate;
    untaxedIncome -= slice;
    lowerBound = upTo;

    // Stop as soon as all income has been assigned to a bracket.
    if (untaxedIncome <= 0) {
      break;
    }
  }

  return roundMoney(accumulatedTax);
}
/**
 * Computes a simplified federal return for the supported scenarios.
 *
 * Steps: net the capital items, apply the traditional IRA above-the-line
 * deduction to reach AGI, choose standard vs. itemized deduction (with a
 * capped SALT component), apply the section 199A dividend deduction, compute
 * tax (using the qualified-dividend worksheet when preferential income is
 * present), subtract the simplified direct foreign tax credit, and compare
 * the result with payments.
 *
 * @param input Income, deduction, payment and scenario inputs.
 * @returns Computed amounts; `refundOrAmountOwed` is payments minus tax
 *          (positive = refund). `netCapitalGain` is the preferential-rate net
 *          capital gain. `issues` lists estimator limitations that were hit.
 */
export function computeFederalReturn(input: FederalComputationInput): FederalComputationResult {
  const issues: string[] = [];

  // Annual limit on a net capital loss deducted against other income.
  // NOTE(review): married-filing-separately uses half this amount; this
  // estimator's documented scope is single / MFJ — confirm if more statuses
  // are added to FilingStatus.
  const CAPITAL_LOSS_DEDUCTION_LIMIT = 3000;

  const capitalNetPreview = roundMoney(
    input.shortTermNetGainLoss + input.longTermNetGainLoss + input.capitalGainDistributions
  );
  // Long-term side of the netting; capital gain distributions are long-term.
  const netLongTerm = roundMoney(input.longTermNetGainLoss + input.capitalGainDistributions);

  // Only the limited portion of a net capital loss may reduce income.
  let capitalNetInIncome = capitalNetPreview;
  if (capitalNetPreview < -CAPITAL_LOSS_DEDUCTION_LIMIT) {
    capitalNetInIncome = -CAPITAL_LOSS_DEDUCTION_LIMIT;
    issues.push(
      'Net capital loss exceeds the $3,000 annual deduction limit; the excess carries forward and is not tracked by this estimator.'
    );
  }

  const maxAboveLineIra = computeTraditionalIraDeduction({
    filingStatus: input.filingStatus,
    wages: input.wages,
    agiBeforeIra: roundMoney(
      input.wages +
        input.taxableInterest +
        input.ordinaryDividends +
        input.retirementTaxableAmount +
        capitalNetInIncome
    ),
    contributionAmount: input.traditionalIraContributions,
    workplaceRetirementCovered: input.workplaceRetirementCovered,
  });

  const agi = roundMoney(
    input.wages +
      input.taxableInterest +
      input.ordinaryDividends +
      input.retirementTaxableAmount +
      capitalNetInIncome -
      maxAboveLineIra
  );

  // SALT can never exceed what was actually paid, whichever cap applies.
  const saltPaid = roundMoney(input.propertyTaxPaid + input.stateIncomeTaxPaid);
  let saltDeduction: number;
  if (agi <= FEDERAL_SALT_PHASEDOWN_AGI_2025[input.filingStatus]) {
    saltDeduction = Math.min(saltPaid, FEDERAL_SALT_LIMIT_2025[input.filingStatus]);
  } else {
    issues.push(
      'Federal SALT phase-down above the 2025 AGI threshold is not implemented; using the minimum floor deduction.'
    );
    saltDeduction = Math.min(saltPaid, FEDERAL_SALT_MIN_FLOOR_2025[input.filingStatus]);
  }

  const itemizedDeduction = roundMoney(input.mortgageInterest + input.pointsPaid + saltDeduction);
  const standardDeduction = FEDERAL_STANDARD_DEDUCTION_2025[input.filingStatus];
  const deductionChoice = itemizedDeduction > standardDeduction ? 'itemized' : 'standard';
  const chosenDeduction = deductionChoice === 'itemized' ? itemizedDeduction : standardDeduction;
  const taxableIncomeBeforeQbi = roundMoney(Math.max(0, agi - chosenDeduction));

  // Preferential-rate net capital gain is the smaller of the net long-term
  // gain and the overall net gain (never negative). Net short-term gains are
  // taxed at ordinary rates and must not be included.
  const netCapitalGain = Math.max(0, Math.min(netLongTerm, capitalNetPreview));

  // The section 199A income limit excludes all preferential income, i.e. net
  // capital gain plus qualified dividends.
  const section199aDeduction = computeSection199aDeduction({
    section199aDividends: input.section199aDividends,
    taxableIncomeBeforeQbi,
    netCapitalGain: roundMoney(netCapitalGain + input.qualifiedDividends),
  });
  const taxableIncome = roundMoney(Math.max(0, taxableIncomeBeforeQbi - section199aDeduction));

  const taxBeforeCredits =
    input.qualifiedDividends > 0 || netCapitalGain > 0
      ? computeQualifiedDividendTax({
          filingStatus: input.filingStatus,
          taxableIncome,
          qualifiedDividends: input.qualifiedDividends,
          netCapitalGain,
        })
      : computeOrdinaryTax(taxableIncome, input.filingStatus);

  const foreignTaxCreditApplied = computeDirectForeignTaxCredit(
    input.foreignTaxPaid,
    input.filingStatus
  );
  if (input.foreignTaxPaid > foreignTaxCreditApplied) {
    issues.push(
      'Foreign tax paid exceeds the simplified direct-credit limit used by this estimator; review the foreign tax credit in filing software.'
    );
  }
  // The credit is non-refundable here: tax is floored at zero.
  const taxBeforePayments = roundMoney(Math.max(0, taxBeforeCredits - foreignTaxCreditApplied));

  const payments = roundMoney(input.federalWithholding + input.federalEstimatedPayments);
  const refundOrAmountOwed = roundMoney(payments - taxBeforePayments);

  return {
    agi,
    traditionalIraDeduction: maxAboveLineIra,
    section199aDeduction,
    deductionChoice,
    standardDeduction,
    itemizedDeduction,
    taxableIncome,
    netCapitalGain,
    taxBeforePayments,
    foreignTaxCreditApplied,
    refundOrAmountOwed,
    saltDeduction,
    issues,
  };
}
/**
 * Computes tax when part of taxable income is qualified dividends and/or net
 * capital gain, splitting that preferential income across the 0%, 15% and 20%
 * bands and taxing the remainder with `computeOrdinaryTax`.
 *
 * The trailing "line N" notes keep the original step numbering. The result is
 * the smaller of the banded computation and the ordinary tax on all of
 * `taxableIncome`.
 */
function computeQualifiedDividendTax(args: {
  filingStatus: FilingStatus;
  taxableIncome: number;
  qualifiedDividends: number;
  netCapitalGain: number;
}): number {
  const { filingStatus, taxableIncome, qualifiedDividends, netCapitalGain } = args;
  const roundedNonNegative = (amount: number): number => Math.max(0, roundMoney(amount));

  const preferentialIncome = roundMoney(qualifiedDividends + netCapitalGain); // line 4
  const ordinaryIncome = roundedNonNegative(taxableIncome - preferentialIncome); // line 5

  // 0% band.
  const zeroRateCeiling = Math.min(
    taxableIncome,
    FEDERAL_QD_ZERO_RATE_THRESHOLD_2025[filingStatus]
  ); // line 7
  const ordinaryInsideZeroBand = Math.min(ordinaryIncome, zeroRateCeiling); // line 8
  const taxedAtZero = roundedNonNegative(zeroRateCeiling - ordinaryInsideZeroBand); // line 9

  // 15% band.
  const preferentialTaxable = Math.min(taxableIncome, preferentialIncome); // line 10
  const preferentialAboveZero = roundedNonNegative(preferentialTaxable - taxedAtZero); // line 12
  const fifteenRateCeiling = Math.min(
    taxableIncome,
    FEDERAL_QD_FIFTEEN_RATE_THRESHOLD_2025[filingStatus]
  ); // line 14
  const incomeAlreadyPlaced = roundMoney(ordinaryIncome + taxedAtZero); // line 15
  const fifteenBandRoom = roundedNonNegative(fifteenRateCeiling - incomeAlreadyPlaced); // line 16
  const taxedAtFifteen = Math.min(preferentialAboveZero, fifteenBandRoom); // line 17
  const fifteenRateTax = roundMoney(taxedAtFifteen * 0.15); // line 18

  // 20% band: whatever preferential income is left.
  const placedBelowTwenty = roundMoney(taxedAtZero + taxedAtFifteen); // line 19
  const taxedAtTwenty = roundedNonNegative(preferentialTaxable - placedBelowTwenty); // line 20
  const twentyRateTax = roundMoney(taxedAtTwenty * 0.2); // line 21

  const ordinaryPortionTax = computeOrdinaryTax(ordinaryIncome, filingStatus); // line 22
  const bandedTax = roundMoney(fifteenRateTax + twentyRateTax + ordinaryPortionTax); // line 23
  const allOrdinaryTax = computeOrdinaryTax(taxableIncome, filingStatus); // line 24

  return Math.min(bandedTax, allOrdinaryTax);
}
/**
 * Attempts deterministic (non-LLM) extraction for one inventoried document.
 *
 * Returns `null` when the detected form type is not supported, the file is
 * not a `.pdf`, no text could be read, or the type-specific parser did not
 * recognise the layout. Otherwise every parsed field is wrapped with the
 * parse confidence and source location (page is always reported as 1).
 */
export function extractDeterministicDocument(args: {
  inputDir: string;
  document: DocumentInventoryItem;
}): ExtractedDocument | null {
  const { inputDir, document } = args;
  const formType = document.detectedFormType;
  if (!canExtractDeterministicDocumentType(formType)) {
    return null;
  }

  const pdfPath = path.join(inputDir, document.filePath);
  const isPdf = path.extname(pdfPath).toLowerCase() === '.pdf';
  if (!isPdf) {
    return null;
  }

  const pdfText = readPdfText(pdfPath);
  if (!pdfText) {
    return null;
  }

  // Dispatch to the parser for the detected form type.
  const parsed: ParsedDeterministicDocument | null =
    formType === '1099-composite'
      ? parse1099CompositeText(pdfText, document.fileName)
      : formType === 'property-tax-bill'
        ? parsePropertyTaxBillText(pdfText, document.fileName)
        : null;
  if (!parsed) {
    return null;
  }

  const { confidence } = parsed;
  const wrappedEntries = Object.entries(parsed.fields).map(([key, value]) => [
    key,
    {
      value,
      confidence,
      sourceFile: document.filePath,
      sourcePage: 1,
    },
  ]);

  return {
    schemaVersion: document.schemaVersion,
    documentId: document.id,
    documentType: formType,
    // Prefer values found in the document text over inventory metadata.
    taxYear: parsed.taxYear ?? document.taxYear,
    payerOrIssuer: parsed.payerOrIssuer ?? document.issuerOrPayer,
    extractionMethod: 'deterministic',
    confidence,
    fields: Object.fromEntries(wrappedEntries),
  };
}
/**
 * Parses the text of a Fidelity-style 2025 composite 1099 statement.
 *
 * Labelled amounts (interest, dividends, withholding, foreign tax, exempt
 * interest) are read from the section before the 1099-B summary, falling back
 * to the whole text when that section cannot be sliced out. The 1099-B
 * summary is read positionally: up to 36 money values are split into rows of
 * six, and at least five rows must be present or `null` is returned.
 *
 * @param rawText Original extracted text, used for the reporting year.
 * @param inline  Normalized single-line text used for label matching.
 */
function parseFidelityCompositeText(
  rawText: string,
  inline: string
): ParsedDeterministicDocument | null {
  const summaryHeading = 'summary of 2025 proceeds from broker and barter exchange transactions';

  const headerSection =
    sliceInlineBetween(inline, '2025 tax reporting statement', summaryHeading) || inline;
  const brokerSummary = sliceInlineBetween(
    inline,
    summaryHeading,
    '1099-b amounts are reported individually to the irs'
  );

  const summaryAmounts = brokerSummary ? extractMoneyValues(brokerSummary) : [];
  const rows = chunk(summaryAmounts.slice(0, 36), 6);
  if (rows.length < 5) {
    return null;
  }

  // Row order as consumed below: short covered, short noncovered,
  // long covered, long noncovered, unknown term.
  const [shortCovered, shortNoncovered, longCovered, longNoncovered, unknownTerm] = rows;

  // Helpers bound to the header section.
  const firstAmount = (label: string) => extractMoneyAfterLabel(headerSection, label);
  const everyAmount = (label: string) => extractAllMoneyAfterLabel(headerSection, label);
  const roundedTotal = (values: Parameters<typeof sumValues>[0]) => roundMoney(sumValues(values));

  const taxYear = extractReportingYear(rawText);
  const payerOrIssuer = inline.includes('national financial services llc')
    ? 'NATIONAL FINANCIAL SERVICES LLC'
    : 'FIDELITY BROKERAGE SERVICES LLC';

  return {
    taxYear,
    payerOrIssuer,
    confidence: 'high',
    fields: compactFields({
      interest_income: firstAmount('1 interest income'),
      ordinary_dividends: firstAmount('1a total ordinary dividends'),
      qualified_dividends: firstAmount('1b qualified dividends'),
      capital_gain_distributions: firstAmount('2a total capital gain distributions'),
      nondividend_distributions: firstAmount('3 nondividend distributions'),
      section_199a_dividends: firstAmount('5 section 199a dividends'),
      federal_withholding: roundedTotal(everyAmount('4 federal income tax withheld')),
      // The remaining totals combine two differently numbered labels each.
      foreign_tax_paid: roundedTotal([
        ...everyAmount('7 foreign tax paid'),
        ...everyAmount('6 foreign tax paid'),
      ]),
      tax_exempt_interest: roundedTotal([
        ...everyAmount('12 exempt interest dividends'),
        ...everyAmount('8 tax-exempt interest'),
      ]),
      specified_private_activity_bond_interest: roundedTotal([
        ...everyAmount('13 specified private activity bond interest dividends'),
        ...everyAmount('9 specified private activity bond interest'),
      ]),
      // Columns used per row: 0 = proceeds, 1 = basis, 3 = wash sale, 4 = net.
      short_term_covered_proceeds: shortCovered[0] ?? null,
      short_term_covered_basis: shortCovered[1] ?? null,
      short_term_wash_sale_adjustments: shortCovered[3] ?? null,
      short_term_covered_net_gain_loss: shortCovered[4] ?? null,
      long_term_covered_proceeds: longCovered[0] ?? null,
      long_term_covered_basis: longCovered[1] ?? null,
      long_term_wash_sale_adjustments: longCovered[3] ?? null,
      long_term_covered_net_gain_loss: longCovered[4] ?? null,
      // Any activity outside the two covered rows is flagged as unsupported.
      has_unsupported_brokerage_rows:
        hasNonZeroValues(shortNoncovered) ||
        hasNonZeroValues(longNoncovered) ||
        hasNonZeroValues(unknownTerm),
    }),
  };
}
0)), + specified_private_activity_bond_interest: roundMoney( + (intValues[6] ?? 0) + (divValues[16] ?? 0) + ), + ordinary_dividends: divValues[0] ?? null, + qualified_dividends: divValues[1] ?? null, + capital_gain_distributions: divValues[2] ?? null, + nondividend_distributions: divValues[8] ?? null, + section_199a_dividends: divValues[10] ?? null, + short_term_covered_proceeds: shortCovered[0] ?? null, + short_term_covered_basis: shortCovered[1] ?? null, + short_term_wash_sale_adjustments: 0, + short_term_covered_net_gain_loss: shortCovered[2] ?? null, + long_term_covered_proceeds: longCovered[0] ?? null, + long_term_covered_basis: longCovered[1] ?? null, + long_term_wash_sale_adjustments: 0, + long_term_covered_net_gain_loss: longCovered[2] ?? null, + has_unsupported_brokerage_rows: false, + }), + }; +} + +function parseRobinhoodCompositeText(inline: string): ParsedDeterministicDocument | null { + const divSection = sliceInlineBetween(inline, '2025 1099-div', '2025 1099-misc'); + const intSection = sliceInlineBetween( + inline, + '2025 1099-int', + 'the following amounts are not reported to the irs' + ); + const shortCoveredSection = sliceInlineBetween( + inline, + 'a (basis reported to the irs)', + 'b (basis not reported to the irs)' + ); + const shortNoncoveredSection = sliceInlineBetween( + inline, + 'b (basis not reported to the irs)', + 'c (form 1099-b not received)' + ); + const shortUnknownSection = sliceInlineBetween( + inline, + 'c (form 1099-b not received)', + 'total short-term' + ); + const longCoveredSection = sliceInlineBetween( + inline, + 'd (basis reported to the irs)', + 'e (basis not reported to the irs)' + ); + const longNoncoveredSection = sliceInlineBetween( + inline, + 'e (basis not reported to the irs)', + 'f (form 1099-b not received)' + ); + const longUnknownSection = sliceInlineBetween( + inline, + 'f (form 1099-b not received)', + 'total long-term' + ); + const undeterminedSection = sliceInlineBetween( + inline, + 'total 
undetermined-term', + 'grand total' + ); + const shortTotalSection = sliceInlineBetween( + inline, + 'total short-term', + 'd (basis reported to the irs)' + ); + const longTotalSection = sliceInlineBetween( + inline, + 'total long-term', + 'b or e (basis not reported to the irs)' + ); + + if ( + !divSection || + !intSection || + !shortNoncoveredSection || + !shortUnknownSection || + !longNoncoveredSection || + !longUnknownSection || + !undeterminedSection || + !shortTotalSection || + !longTotalSection + ) { + return null; + } + + const divValues = extractMoneyValues(divSection); + const intValues = extractMoneyValues(intSection); + let shortCovered = shortCoveredSection ? extractMoneyValues(shortCoveredSection).slice(0, 5) : []; + const shortNoncovered = extractMoneyValues(shortNoncoveredSection).slice(0, 5); + const shortUnknown = extractMoneyValues(shortUnknownSection).slice(0, 5); + let longCovered = longCoveredSection ? extractMoneyValues(longCoveredSection).slice(0, 5) : []; + const longNoncovered = extractMoneyValues(longNoncoveredSection).slice(0, 5); + const longUnknown = extractMoneyValues(longUnknownSection).slice(0, 5); + const undetermined = extractMoneyValues(undeterminedSection).slice(0, 5); + const shortTotals = extractMoneyValues(shortTotalSection).slice(0, 5); + const longTotals = extractMoneyValues(longTotalSection).slice(0, 5); + + if (shortCovered.length === 0 && longCovered.length === 0) { + if ( + hasNonZeroValues(shortTotals) || + hasNonZeroValues(longTotals) || + hasNonZeroValues(undetermined) + ) { + return null; + } + shortCovered = [0, 0, 0, 0, 0]; + longCovered = [0, 0, 0, 0, 0]; + } + + if (divValues.length < 17 || intValues.length < 9 || shortCovered.length < 5 || longCovered.length < 5) { + return null; + } + + return { + taxYear: extractReportingYear(inline), + payerOrIssuer: 'ROBINHOOD MARKETS INC', + confidence: 'high', + fields: compactFields({ + interest_income: intValues[0] ?? null, + ordinary_dividends: divValues[0] ?? 
/**
 * Collect every money amount that follows `label`, ignoring occurrences where
 * the label is immediately followed by `excludedSuffix`.
 *
 * Needed when one label is a strict prefix of another (for example
 * "... bond interest" vs "... bond interest dividends"): a plain label search
 * for the shorter text also matches the longer one and counts its amount twice.
 *
 * @param text normalized (lower-cased, single-spaced) document text
 * @param label label to search for; lower-cased before matching
 * @param excludedSuffix text that must NOT directly follow the label
 * @returns parsed amounts in document order
 */
function extractAllMoneyAfterLabelExcludingSuffix(
  text: string,
  label: string,
  excludedSuffix: string
): number[] {
  // Same gap/amount grammar as extractAllMoneyAfterLabel, plus a negative lookahead.
  const pattern = new RegExp(
    `${escapeRegExp(label.toLowerCase())}(?!${escapeRegExp(excludedSuffix.toLowerCase())})` +
      `[^\\d$()\\-]{0,120}((?:\\(|-)?\\$?\\d[\\d,]*\\.\\d{2}\\)?)`,
    'g'
  );
  return Array.from(text.matchAll(pattern), (match) => parseMoneyString(match[1]));
}

/**
 * Parse a Charles Schwab composite 1099 from its normalized inline text.
 *
 * Ordinary/qualified dividends and interest income are read positionally from
 * the LAST "detail information of ..." sections; the remaining amounts are
 * located by label anywhere in the text.
 *
 * @param inline normalized document text
 * @returns the parsed document, or null when any of the three required amounts
 *          (ordinary dividends, qualified dividends, interest income) is missing
 */
function parseSchwabCompositeText(inline: string): ParsedDeterministicDocument | null {
  const dividendDetailSection = sliceInlineLastBetween(
    inline,
    'detail information of dividends and distributions',
    'detail information of interest income'
  );
  const interestDetailSection = sliceInlineLastBetween(
    inline,
    'detail information of interest income',
    'terms and conditions'
  );

  const dividendDetailValues = dividendDetailSection ? extractMoneyValues(dividendDetailSection) : [];
  const interestDetailValues = interestDetailSection ? extractMoneyValues(interestDetailSection) : [];
  const ordinaryDividends = dividendDetailValues[0] ?? null;
  const qualifiedDividends = dividendDetailValues[1] ?? null;
  const interestIncome = interestDetailValues[0] ?? null;

  if (ordinaryDividends === null || qualifiedDividends === null || interestIncome === null) {
    return null;
  }

  const privateActivityBondLabel = 'specified private activity bond interest';

  return {
    taxYear: extractReportingYear(inline),
    payerOrIssuer: 'CHARLES SCHWAB & CO., INC.',
    confidence: 'high',
    fields: compactFields({
      interest_income: interestIncome,
      ordinary_dividends: ordinaryDividends,
      qualified_dividends: qualifiedDividends,
      capital_gain_distributions: extractMoneyAfterLabel(inline, 'total capital gain distributions'),
      nondividend_distributions: extractMoneyAfterLabel(inline, 'nondividend distributions'),
      federal_withholding: roundMoney(
        sumValues([
          ...extractAllMoneyAfterLabel(inline, 'federal income tax withheld'),
        ])
      ),
      section_199a_dividends: extractMoneyAfterLabel(inline, 'section 199a dividends'),
      foreign_tax_paid: roundMoney(sumValues(extractAllMoneyAfterLabel(inline, 'foreign tax paid'))),
      tax_exempt_interest: roundMoney(
        sumValues([
          ...extractAllMoneyAfterLabel(inline, 'exempt-interest dividends'),
          ...extractAllMoneyAfterLabel(inline, 'tax-exempt interest'),
        ])
      ),
      specified_private_activity_bond_interest: roundMoney(
        sumValues([
          ...extractAllMoneyAfterLabel(inline, `${privateActivityBondLabel} dividends`),
          // The interest label is a prefix of the dividends label; exclude the
          // dividends occurrences here so their amounts are not summed twice.
          ...extractAllMoneyAfterLabelExcludingSuffix(
            inline,
            privateActivityBondLabel,
            ' dividends'
          ),
        ])
      ),
      // NOTE(review): all 1099-B buckets are hard-coded to zero and no
      // unsupported rows are ever reported for this provider — confirm that
      // Schwab statements with sales are routed elsewhere.
      short_term_covered_proceeds: 0,
      short_term_covered_basis: 0,
      short_term_wash_sale_adjustments: 0,
      short_term_covered_net_gain_loss: 0,
      long_term_covered_proceeds: 0,
      long_term_covered_basis: 0,
      long_term_wash_sale_adjustments: 0,
      long_term_covered_net_gain_loss: 0,
      has_unsupported_brokerage_rows: false,
    }),
  };
}

/**
 * Identify which brokerage produced a composite statement.
 *
 * Checks are ordered: Fidelity, Pershing, Robinhood, Schwab. Each provider is
 * recognized either by a marker string in the normalized text or by its name
 * appearing in the (case-insensitive) file name.
 *
 * @param inline normalized document text
 * @param fileName file name of the document
 * @returns 'fidelity' | 'pershing' | 'robinhood' | 'schwab' | 'unknown'
 */
function detectCompositeProvider(inline: string, fileName: string): string {
  const lowerFileName = fileName.toLowerCase();
  if (
    inline.includes('fidelity brokerage services llc') ||
    inline.includes('national financial services llc') ||
    lowerFileName.includes('fidelity')
  ) {
    return 'fidelity';
  }
  if (inline.includes("payer's information: pershing llc") || lowerFileName.includes('pershing')) {
    return 'pershing';
  }
  if (inline.includes('robinhood markets inc') || lowerFileName.includes('robinhood')) {
    return 'robinhood';
  }
  if (
    inline.includes('charles schwab & co., inc.') ||
    inline.includes('schwab one account') ||
    lowerFileName.includes('schwab')
  ) {
    return 'schwab';
  }
  return 'unknown';
}
// NOTE(review): generic type arguments in this section were lost in the
// extracted source and have been reconstructed — confirm against the repository.

/**
 * Extract the text of a PDF by running `pdftotext <file> -`.
 *
 * @returns the command's stdout, or null if the command throws for any reason
 */
function readPdfText(filePath: string): string | null {
  try {
    return execFileSync('pdftotext', [filePath, '-'], {
      encoding: 'utf-8',
      stdio: ['ignore', 'pipe', 'ignore'],
      maxBuffer: MAX_PDF_TEXT_BYTES,
    });
  } catch {
    return null;
  }
}

/** Lower-case the text, turn NBSPs into spaces, collapse whitespace runs and trim. */
function normalizeInline(text: string): string {
  return text
    .toLowerCase()
    .replace(/\u00a0/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/** First amount found by extractAllMoneyAfterLabel, or null when there is none. */
function extractMoneyAfterLabel(text: string, label: string): number | null {
  const matches = extractAllMoneyAfterLabel(text, label);
  return matches.length > 0 ? matches[0] : null;
}

/**
 * Like extractMoneyAfterLabel, but allows ANY characters (up to 120, lazily)
 * between the label and the amount, including digits and punctuation.
 */
function extractMoneyAfterLooseLabel(text: string, label: string): number | null {
  const pattern = new RegExp(
    `${escapeRegExp(label.toLowerCase())}.{0,120}?((?:\\(|-)?\\$?\\d[\\d,]*\\.\\d{2}\\)?)`
  );
  const match = text.match(pattern);
  return match ? parseMoneyString(match[1]) : null;
}

/**
 * Find every occurrence of `label` followed — after at most 120 characters that
 * are not digits, `$`, parentheses or `-` — by a two-decimal money amount.
 *
 * @returns the parsed amounts in document order (empty when nothing matches)
 */
function extractAllMoneyAfterLabel(text: string, label: string): number[] {
  const pattern = new RegExp(
    `${escapeRegExp(label.toLowerCase())}[^\\d$()\\-]{0,120}((?:\\(|-)?\\$?\\d[\\d,]*\\.\\d{2}\\)?)`,
    'g'
  );
  const results: number[] = [];
  for (const match of text.matchAll(pattern)) {
    results.push(parseMoneyString(match[1]));
  }
  return results;
}

/**
 * Find an `M/D/YYYY` date within 40 non-digit characters after `label` and
 * return it as `YYYY-MM-DD`. The date parts are zero-padded but not validated.
 */
function extractDateAfterLabel(text: string, label: string): string | null {
  const pattern = new RegExp(
    `${escapeRegExp(label.toLowerCase())}[^0-9]{0,40}(\\d{1,2}/\\d{1,2}/\\d{4})`
  );
  const match = text.match(pattern);
  if (!match) {
    return null;
  }
  const [month, day, year] = match[1].split('/').map(Number);
  return `${year.toString().padStart(4, '0')}-${month.toString().padStart(2, '0')}-${day
    .toString()
    .padStart(2, '0')}`;
}

/** All two-decimal amounts in the text; `$`, leading `-` and parentheses are optional. */
function extractMoneyValues(text: string): number[] {
  const matches = text.match(/(?:\(|-)?\$?\d[\d,]*\.\d{2}\)?/g);
  return matches ? matches.map(parseMoneyString) : [];
}

/** All two-decimal amounts that carry an explicit `$` sign. */
function extractDollarMoneyValues(text: string): number[] {
  const matches = text.match(/\(?\$-?\d[\d,]*\.\d{2}\)?/g);
  return matches ? matches.map(parseMoneyString) : [];
}

/**
 * Convert a matched money token to a number. A `(` anywhere in the token or a
 * leading `-` makes the result negative; `$`, commas, parentheses and
 * whitespace are stripped before conversion.
 */
function parseMoneyString(raw: string): number {
  const negative = raw.includes('(') || raw.trim().startsWith('-');
  const cleaned = raw.replace(/[\s$,()]/g, '').replace(/^-/, '');
  const value = Number(cleaned);
  return negative ? -value : value;
}

/**
 * Shared tail of the two slice helpers: given the index of the start marker,
 * return the text from that marker up to (not including) the end marker.
 *
 * The end marker is searched for AFTER the start marker, so an end marker that
 * also occurs inside the start marker (or equals it) cannot produce an empty
 * slice.
 */
function sliceFromMarkerIndex(
  text: string,
  start: number,
  startMarker: string,
  endMarker: string
): string | null {
  if (start === -1) {
    return null;
  }
  const fromStart = text.slice(start);
  const endRelative = fromStart.indexOf(endMarker.toLowerCase(), startMarker.length);
  return endRelative === -1 ? fromStart : fromStart.slice(0, endRelative);
}

/**
 * Slice from the FIRST occurrence of `startMarker` up to the next `endMarker`.
 *
 * @returns null when the start marker is absent; the rest of the text when the
 *          end marker is absent; otherwise the text between them (start marker
 *          included, end marker excluded)
 */
function sliceInlineBetween(text: string, startMarker: string, endMarker: string): string | null {
  const start = text.indexOf(startMarker.toLowerCase());
  return sliceFromMarkerIndex(text, start, startMarker, endMarker);
}

/** Same as sliceInlineBetween, but anchored on the LAST occurrence of `startMarker`. */
function sliceInlineLastBetween(text: string, startMarker: string, endMarker: string): string | null {
  const start = text.lastIndexOf(startMarker.toLowerCase());
  return sliceFromMarkerIndex(text, start, startMarker, endMarker);
}

/**
 * Guess the reporting year of a document.
 *
 * Tries a list of specific phrasings in order and falls back to the first
 * standalone `20xx` token in the text.
 */
function extractReportingYear(text: string): number | null {
  // A separate "for tax year ..." pattern is unnecessary: any text it matches
  // is already matched by the "tax year ..." pattern, which runs first.
  const patterns = [
    /tax year[^0-9]{0,20}(20\d{2})/i,
    /(?:^|\s)(20\d{2})\s+tax reporting statement/i,
    /(?:^|\s)(20\d{2})\s+1099-(?:div|int|b|misc|oid|r)\b/i,
    /for fiscal year [a-z]+ \d{1,2}, (20\d{2})/i,
  ];

  for (const pattern of patterns) {
    const match = text.match(pattern);
    if (match) {
      return Number(match[1]);
    }
  }

  // NOTE(review): this fallback accepts any 20xx token, including form
  // revision years — callers may want to treat it as low confidence.
  const fallback = text.match(/\b(20\d{2})\b/);
  return fallback ? Number(fallback[1]) : null;
}

/** Split `values` into consecutive groups of exactly `size`; a short trailing group is dropped. */
function chunk(values: number[], size: number): number[][] {
  const result: number[][] = [];
  for (let index = 0; index < values.length; index += size) {
    const slice = values.slice(index, index + size);
    if (slice.length === size) {
      result.push(slice);
    }
  }
  return result;
}

/** True when at least one value has a non-zero magnitude. */
function hasNonZeroValues(values: number[]): boolean {
  return values.some((value) => Math.abs(value) > 0);
}

/** Sum of the values (0 for an empty list). */
function sumValues(values: number[]): number {
  return values.reduce((sum, value) => sum + value, 0);
}

/** Round to two decimal places using Math.round on the value scaled by 100. */
function roundMoney(value: number): number {
  return Math.round(value * 100) / 100;
}

/** Drop every entry whose value is null or undefined. */
function compactFields<V>(fields: Record<string, V | null | undefined>): Record<string, V> {
  return Object.fromEntries(
    Object.entries(fields).filter(([, value]) => value !== null && value !== undefined)
  ) as Record<string, V>;
}

/**
 * Return the first argument that is neither null nor undefined, or null when
 * there is none. (Skipping undefined keeps the result within `T | null`.)
 */
function firstNonNull<T>(...values: Array<T | null | undefined>): T | null {
  for (const value of values) {
    if (value !== null && value !== undefined) {
      return value;
    }
  }
  return null;
}

/** Escape regular-expression metacharacters so `value` matches literally. */
function escapeRegExp(value: string): string {
  return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
process.env.OPENAI_API_KEY; + if (!apiKey) { + return null; + } + if (!canExtractLiveDocumentType(args.document.detectedFormType)) { + return null; + } + + const absolutePath = path.join(args.inputDir, args.document.filePath); + const contentPart = buildContentPart(absolutePath); + + const prompt = buildExtractionPrompt(args.document.detectedFormType); + + const response = await fetch(OPENAI_API_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: DEFAULT_MODEL, + input: [ + { + role: 'user', + content: [ + contentPart, + { + type: 'input_text', + text: prompt, + }, + ], + }, + ], + }), + }); + + if (!response.ok) { + throw new Error(`OpenAI extraction request failed: ${response.status} ${await response.text()}`); + } + + const payload = (await response.json()) as { + output_text?: string; + output?: Array<{ + content?: Array<{ + type?: string; + text?: string; + }>; + }>; + }; + + const outputText = payload.output_text || flattenOutputText(payload.output || []); + const parsed = parseJsonObject(outputText); + + return { + schemaVersion: args.document.schemaVersion, + documentId: args.document.id, + documentType: args.document.detectedFormType, + taxYear: typeof parsed.taxYear === 'number' ? parsed.taxYear : args.document.taxYear, + payerOrIssuer: typeof parsed.payerOrIssuer === 'string' ? parsed.payerOrIssuer : null, + extractionMethod: 'live-openai', + confidence: parsed.confidence === 'low' || parsed.confidence === 'medium' ? 
parsed.confidence : 'high', + fields: normalizeFields(args.document.filePath, parsed.fields || {}), + }; +} + +export async function reviewLiveDocumentTaxYear(args: { + inputDir: string; + document: DocumentInventoryItem; +}): Promise<{ + taxYear: number | null; + confidence: 'high' | 'medium' | 'low'; + rationale: string; +} | null> { + const apiKey = process.env.OPENAI_API_KEY; + if (!apiKey) { + return null; + } + + const absolutePath = path.join(args.inputDir, args.document.filePath); + const contentPart = buildContentPart(absolutePath); + + const response = await fetch(OPENAI_API_URL, { + method: 'POST', + headers: { + Authorization: `Bearer ${apiKey}`, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + model: DEFAULT_MODEL, + input: [ + { + role: 'user', + content: [ + contentPart, + { + type: 'input_text', + text: + 'Determine the reporting tax year of this tax document. Return JSON only with keys taxYear, confidence, and rationale. Do not confuse the form revision date, such as "(Rev. January 2024)", with the reporting tax year. If the reporting year is not visible, return taxYear as null.', + }, + ], + }, + ], + }), + }); + + if (!response.ok) { + throw new Error(`OpenAI tax-year review failed: ${response.status} ${await response.text()}`); + } + + const payload = (await response.json()) as { + output_text?: string; + output?: Array<{ + content?: Array<{ + type?: string; + text?: string; + }>; + }>; + }; + const outputText = payload.output_text || flattenOutputText(payload.output || []); + const parsed = parseJsonObject(outputText) as { + taxYear?: unknown; + confidence?: unknown; + rationale?: unknown; + }; + + return { + taxYear: typeof parsed.taxYear === 'number' ? parsed.taxYear : null, + confidence: normalizeConfidence(parsed.confidence), + rationale: typeof parsed.rationale === 'string' ? 
// NOTE(review): the generic type arguments of `fields` were lost in the
// extracted source; `Record<string, unknown>` is the widest compatible type —
// confirm against the repository.

/** Loose view of one model-returned field entry; every property is optional and untrusted. */
interface RawLiveField {
  value?: unknown;
  confidence?: unknown;
  sourcePage?: unknown;
}

/**
 * Normalize a single model-returned field entry.
 *
 * A well-formed entry is an object with `value`, `confidence` and `sourcePage`.
 * Anything else (null, a bare number/string, an array) is treated as the value
 * itself instead of throwing on a property access.
 */
function normalizeField(sourceFile: string, rawField: unknown) {
  const field: RawLiveField =
    typeof rawField === 'object' && rawField !== null && !Array.isArray(rawField)
      ? (rawField as RawLiveField)
      : { value: rawField };
  return {
    value: normalizeValue(field.value),
    confidence: normalizeConfidence(field.confidence),
    sourceFile,
    // Default to page 1 when the model did not report a numeric page.
    sourcePage: typeof field.sourcePage === 'number' ? field.sourcePage : 1,
  };
}

/**
 * Normalize every field returned by the model into the extracted-field shape
 * `{ value, confidence, sourceFile, sourcePage }`.
 *
 * @param sourceFile file path recorded on each field
 * @param fields raw `fields` object from the model response
 */
function normalizeFields(sourceFile: string, fields: Record<string, unknown>) {
  return Object.fromEntries(
    Object.entries(fields).map(([key, rawField]) => [key, normalizeField(sourceFile, rawField)])
  );
}

/**
 * Coerce a model-returned value to number | string | boolean | null.
 *
 * - null and undefined become null (a missing `value` must not leak `undefined`)
 * - numbers and booleans pass through
 * - numeric strings become numbers after stripping `$` and `,`; an
 *   accounting-style parenthesized amount such as "(1,234.50)" becomes negative,
 *   matching how the deterministic parsers read parentheses
 * - other strings are returned unchanged
 * - anything else is JSON-stringified (null when it has no JSON form)
 */
function normalizeValue(value: unknown): number | string | boolean | null {
  if (value === null || value === undefined) {
    return null;
  }
  if (typeof value === 'number' || typeof value === 'boolean') {
    return value;
  }
  if (typeof value === 'string') {
    const trimmed = value.replace(/[$,]/g, '').trim();
    if (/^-?\d+(\.\d+)?$/.test(trimmed)) {
      return Number(trimmed);
    }
    const accounting = /^\((\d+(?:\.\d+)?)\)$/.exec(trimmed);
    if (accounting) {
      // `0 - x` rather than `-x` so "(0.00)" yields 0 and not -0.
      return 0 - Number(accounting[1]);
    }
    return value;
  }
  return JSON.stringify(value) ?? null;
}

/**
 * Read a file and wrap it as a request content part: PDFs become an
 * `input_file` data URL, everything else an `input_image` data URL whose MIME
 * type is derived from the extension.
 */
function buildContentPart(absolutePath: string) {
  const ext = path.extname(absolutePath).toLowerCase();
  const bytes = fs.readFileSync(absolutePath);
  const base64 = bytes.toString('base64');

  return ext === '.pdf'
    ? {
        type: 'input_file' as const,
        filename: path.basename(absolutePath),
        file_data: `data:application/pdf;base64,${base64}`,
      }
    : {
        type: 'input_image' as const,
        image_url: `data:${detectMimeType(ext)};base64,${base64}`,
      };
}

/** Join the text of every `output_text` content part, one part per line. */
function flattenOutputText(
  output: Array<{
    content?: Array<{
      type?: string;
      text?: string;
    }>;
  }>
): string {
  const parts: string[] = [];
  for (const item of output) {
    for (const content of item.content || []) {
      if (content.type === 'output_text' && content.text) {
        parts.push(content.text);
      }
    }
  }
  return parts.join('\n');
}

/**
 * Parse the JSON object embedded in the model's text output.
 *
 * Everything before the first `{` and after the last `}` is ignored, so fenced
 * or prefixed output still parses.
 *
 * @throws Error when no `{...}` span exists or the span is not valid JSON; the
 *         message includes the first 200 characters of the raw output
 */
function parseJsonObject(raw: string): {
  taxYear?: unknown;
  payerOrIssuer?: unknown;
  confidence?: unknown;
  fields?: Record<string, unknown>;
} {
  const start = raw.indexOf('{');
  const end = raw.lastIndexOf('}');
  if (start === -1 || end === -1 || end <= start) {
    throw new Error(`OpenAI extraction did not return JSON: ${raw.slice(0, 200)}`);
  }
  try {
    return JSON.parse(raw.slice(start, end + 1));
  } catch {
    // Surface the offending output instead of a bare SyntaxError.
    throw new Error(`OpenAI extraction returned invalid JSON: ${raw.slice(0, 200)}`);
  }
}

/** MIME type for a lower-cased image extension; unknown extensions map to application/octet-stream. */
function detectMimeType(ext: string): string {
  switch (ext) {
    case '.png':
      return 'image/png';
    case '.jpg':
    case '.jpeg':
      return 'image/jpeg';
    case '.webp':
      return 'image/webp';
    default:
      return 'application/octet-stream';
  }
}

/**
 * Pass through a valid confidence label; anything else becomes 'high'.
 *
 * NOTE(review): defaulting a missing or unrecognized confidence to 'high' is
 * optimistic for model-extracted tax data — confirm this is intended.
 */
function normalizeConfidence(value: unknown): 'high' | 'medium' | 'low' {
  if (value === 'low' || value === 'medium' || value === 'high') {
    return value;
  }
  return 'high';
}
The fields object must map field names to { value, sourcePage, confidence }.'; + + switch (documentType) { + case 'W-2': + return `${common} For W-2 include: box1_wages, box2_federal_withholding, box12_code_d if present, state_ca_wages, state_ca_withholding.`; + case '1099-INT': + return `${common} For 1099-INT include: interest_income, federal_withholding if present, foreign_tax_paid if present, tax_exempt_interest if present, specified_private_activity_bond_interest if present.`; + case '1099-DIV': + return `${common} For 1099-DIV include: ordinary_dividends, qualified_dividends, capital_gain_distributions, federal_withholding if present, foreign_tax_paid if present, tax_exempt_interest if present, specified_private_activity_bond_interest if present, section_199a_dividends if present, nondividend_distributions if present.`; + case '1098': + return `${common} For 1098 include: mortgage_interest_received, property_tax_paid if present, points_paid if present.`; + case '1099-B': + return `${common} For 1099-B include summary bucket fields when available: short_term_covered_proceeds, short_term_covered_basis, short_term_wash_sale_adjustments, short_term_covered_net_gain_loss if available, long_term_covered_proceeds, long_term_covered_basis, long_term_wash_sale_adjustments, long_term_covered_net_gain_loss if available, federal_withholding if present.`; + case '1099-R': + return `${common} For 1099-R include: gross_distribution, taxable_amount, federal_withholding if present, state_withholding if present, distribution_code if present.`; + case '5498': + return `${common} For 5498 include: account_type and contribution_amount. 
/** Shape of a `<document>.mock.json` sidecar used for fixture-based extraction. */
interface MockSidecarFile {
  payerOrIssuer?: string;
  taxYear?: number;
  confidence?: 'high' | 'medium' | 'low';
  fields: Record<
    string,
    {
      value: number | string | boolean | null;
      sourcePage?: number;
      confidence?: 'high' | 'medium' | 'low';
    }
  >;
}

// Document types for which a mock sidecar is honored.
const SUPPORTED_MOCK_TYPES = new Set<DocumentType>([
  'W-2',
  '1099-INT',
  '1099-DIV',
  '1098',
  '1099-B',
  '1099-composite',
  '1099-R',
  '5498',
  'property-tax-bill',
]);

/** True when mock-sidecar extraction is supported for the given document type. */
export function canExtractMockDocumentType(documentType: DocumentType): boolean {
  return SUPPORTED_MOCK_TYPES.has(documentType);
}

/**
 * Read and validate a sidecar file.
 *
 * @returns the parsed sidecar, or null when the file does not exist
 * @throws Error naming the sidecar path when the JSON is invalid or lacks a
 *         `fields` object; other read errors are rethrown unchanged
 */
function readMockSidecar(sidecarPath: string): MockSidecarFile | null {
  let raw: string;
  try {
    // Read directly instead of existsSync + read, which races with deletion.
    raw = fs.readFileSync(sidecarPath, 'utf-8');
  } catch (error) {
    const code = (error as { code?: string }).code;
    if (code === 'ENOENT' || code === 'ENOTDIR') {
      return null;
    }
    throw error;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (error) {
    throw new Error(`Invalid JSON in mock sidecar ${sidecarPath}: ${(error as Error).message}`);
  }

  const fields = (parsed as { fields?: unknown } | null)?.fields;
  if (typeof fields !== 'object' || fields === null || Array.isArray(fields)) {
    throw new Error(`Mock sidecar ${sidecarPath} must contain a "fields" object`);
  }
  return parsed as MockSidecarFile;
}

/**
 * Build an extracted document from the `.mock.json` sidecar next to the input
 * file, if one exists.
 *
 * Per-field confidence falls back to the sidecar-level confidence and then to
 * 'high'; a missing `sourcePage` defaults to 1. Tax year, payer and document
 * confidence fall back to the inventory item's values.
 *
 * @returns the extracted document, or null when the type is unsupported or no
 *          sidecar file exists
 * @throws Error when the sidecar exists but is malformed
 */
export function extractMockDocument(args: {
  inputDir: string;
  document: DocumentInventoryItem;
}): ExtractedDocument | null {
  const { inputDir, document } = args;
  if (!canExtractMockDocumentType(document.detectedFormType)) {
    return null;
  }

  const fullPath = path.join(inputDir, document.filePath);
  const sidecarPath = `${fullPath}.mock.json`;
  const sidecar = readMockSidecar(sidecarPath);
  if (sidecar === null) {
    return null;
  }

  const fields = Object.fromEntries(
    Object.entries(sidecar.fields).map(([key, field]) => {
      if (typeof field !== 'object' || field === null) {
        throw new Error(`Mock sidecar ${sidecarPath}: field "${key}" must be an object`);
      }
      return [
        key,
        {
          value: field.value,
          confidence: field.confidence || sidecar.confidence || 'high',
          sourceFile: document.filePath,
          sourcePage: field.sourcePage || 1,
        },
      ];
    })
  );

  return {
    schemaVersion: document.schemaVersion,
    documentId: document.id,
    documentType: document.detectedFormType,
    taxYear: sidecar.taxYear ?? document.taxYear,
    payerOrIssuer: sidecar.payerOrIssuer ?? document.issuerOrPayer,
    extractionMethod: 'mock',
    confidence: sidecar.confidence || document.confidence,
    fields,
  };
}
filenameResult : contentResult; +} + +export function classifyTextSnippet(text: string): ClassificationResult { + const normalized = normalize(text); + + if (!normalized) { + return unknown(); + } + + if ( + includesAll(normalized, ['property tax bill']) || + includesAll(normalized, ['secured', 'bill']) || + includesAll(normalized, ['tax collector']) || + includesAll(normalized, ['treasurer', 'property location']) + ) { + return { detectedFormType: 'property-tax-bill', confidence: 'high' }; + } + + if ( + includesAll(normalized, ['1099 composite']) || + includesAll(normalized, ['form 1099 composite']) || + includesAll(normalized, ['consolidated form 1099']) || + (includesAll(normalized, ['tax reporting statement']) && + (includesAll(normalized, ['1099-div']) || + includesAll(normalized, ['1099-b']) || + includesAll(normalized, ['1099-int']))) || + includesAll(normalized, ['year-end statement', '1099-div']) + ) { + return { detectedFormType: '1099-composite', confidence: 'high' }; + } + + if ( + includesAll(normalized, ['form w-2']) || + includesAll(normalized, ['wages, tips, other compensation']) + ) { + return { detectedFormType: 'W-2', confidence: 'high' }; + } + + if ( + includesAll(normalized, ['form 1099-r']) || + includesAll(normalized, ['distributions from pensions']) || + includesAll(normalized, ['gross distribution', 'taxable amount']) + ) { + return { detectedFormType: '1099-R', confidence: 'high' }; + } + + if ( + includesAll(normalized, ['form 1098']) || + includesAll(normalized, ['mortgage interest statement']) || + includesAll(normalized, ['mortgage interest received']) + ) { + return { detectedFormType: '1098', confidence: 'high' }; + } + + if ( + includesAll(normalized, ['form 1099-int']) || + (includesAll(normalized, ['1099-int']) && includesAll(normalized, ['interest income'])) + ) { + return { detectedFormType: '1099-INT', confidence: 'high' }; + } + + if ( + includesAll(normalized, ['form 1099-div']) || + includesAll(normalized, ['dividends and 
/**
 * Classify a document from its file name alone.
 *
 * Patterns are tried in order against the lower-cased name and the first match
 * wins, always with 'medium' confidence.
 *
 * The single-letter suffix patterns (1099-B, 1099-R) require that the letter is
 * not followed by another letter; otherwise names such as
 * "1099-robinhood.pdf" or "1099-bank.pdf" would be read as 1099-R / 1099-B.
 *
 * @param fileName base name of the file
 * @returns the detected type, or the shared "unknown" result when nothing matches
 */
function classifyByFileName(fileName: string): ClassificationResult {
  const lower = fileName.toLowerCase();

  const matches: Array<[DocumentType, RegExp]> = [
    ['property-tax-bill', /(property.*tax.*bill|secured[-_ ]bill|tax[-_ ]bill)/],
    ['1099-composite', /(1099.*(composite|consolidated)|(composite|consolidated).*(1099|tax reporting statement))/],
    ['W-2', /(^|[^0-9])w[\s_-]?2([^0-9]|$)/],
    ['1099-INT', /1099[\s_-]?int/],
    ['1099-DIV', /1099[\s_-]?div/],
    ['1099-B', /1099[\s_-]?b(?![a-z])/],
    ['1098', /(^|[^0-9])1098([^0-9]|$)/],
    ['5498', /(^|[^0-9])5498([^0-9]|$)/],
    ['1099-R', /1099[\s_-]?r(?![a-z])/],
    // NOTE(review): the prior-year pattern hard-codes 2024 — confirm whether it
    // should follow the supported tax year.
    ['prior-year-return', /(prior|previous).*(return)|return.*2024|2024.*return/],
  ];

  for (const [type, pattern] of matches) {
    if (pattern.test(lower)) {
      return { detectedFormType: type, confidence: 'medium' };
    }
  }

  return unknown();
}

/**
 * Classify a document from its text content. Only `.pdf` files are inspected;
 * any other extension, or a PDF whose text cannot be read, yields "unknown".
 */
function classifyByContent(filePath: string): ClassificationResult {
  const ext = path.extname(filePath).toLowerCase();
  if (ext !== '.pdf') {
    return unknown();
  }

  const text = readPdfText(filePath);
  return text ? classifyTextSnippet(text) : unknown();
}

/**
 * Extract the text of pages 1-3 of a PDF via `pdftotext`.
 *
 * @returns the text truncated to MAX_PDF_TEXT_BYTES characters, or null if the
 *          command throws for any reason
 */
function readPdfText(filePath: string): string | null {
  try {
    const output = execFileSync(
      'pdftotext',
      ['-f', '1', '-l', '3', filePath, '-'],
      {
        encoding: 'utf-8',
        stdio: ['ignore', 'pipe', 'ignore'],
        maxBuffer: MAX_PDF_TEXT_BYTES,
      }
    );
    return output.slice(0, MAX_PDF_TEXT_BYTES);
  } catch {
    return null;
  }
}

/** Lower-case, collapse whitespace runs to single spaces, and trim. */
function normalize(text: string): string {
  return text
    .toLowerCase()
    .replace(/\s+/g, ' ')
    .trim();
}

/** True when every part occurs somewhere in the text (vacuously true for an empty list). */
function includesAll(text: string, parts: string[]): boolean {
  return parts.every((part) => text.includes(part));
}

/** The shared "could not classify" result: type 'unknown' with 'low' confidence. */
function unknown(): ClassificationResult {
  return { detectedFormType: 'unknown', confidence: 'low' };
}
// NOTE(review): the `Record<...>` type arguments below were lost in the
// extracted source and are reconstructed as Record<FilingStatus, number> —
// confirm against the repository.

/** A dollar threshold per supported filing status. */
export interface FilingThresholds {
  single: number;
  mfj: number;
}

/** 2025 federal standard deduction by filing status. */
export const FEDERAL_STANDARD_DEDUCTION_2025: Record<FilingStatus, number> = {
  single: 15750,
  mfj: 31500,
};

/**
 * 2025 federal ordinary-income brackets, lowest first.
 * `upTo` is presumably the upper bound of taxable income for the bracket —
 * confirm inclusive/exclusive handling against the consumer.
 */
export const FEDERAL_ORDINARY_BRACKETS_2025: Record<
  FilingStatus,
  Array<{ upTo: number; rate: number }>
> = {
  single: [
    { upTo: 11925, rate: 0.1 },
    { upTo: 48475, rate: 0.12 },
    { upTo: 103350, rate: 0.22 },
    { upTo: 197300, rate: 0.24 },
    { upTo: 250525, rate: 0.32 },
    { upTo: 626350, rate: 0.35 },
    { upTo: Number.POSITIVE_INFINITY, rate: 0.37 },
  ],
  mfj: [
    { upTo: 23850, rate: 0.1 },
    { upTo: 96950, rate: 0.12 },
    { upTo: 206700, rate: 0.22 },
    { upTo: 394600, rate: 0.24 },
    { upTo: 501050, rate: 0.32 },
    { upTo: 751600, rate: 0.35 },
    { upTo: Number.POSITIVE_INFINITY, rate: 0.37 },
  ],
};

/** Taxable-income ceiling of the 0% qualified-dividend / long-term gain rate. */
export const FEDERAL_QD_ZERO_RATE_THRESHOLD_2025: Record<FilingStatus, number> = {
  single: 48350,
  mfj: 96700,
};

/** Taxable-income ceiling of the 15% qualified-dividend / long-term gain rate. */
export const FEDERAL_QD_FIFTEEN_RATE_THRESHOLD_2025: Record<FilingStatus, number> = {
  single: 533400,
  mfj: 600050,
};

/** Maximum state-and-local-tax deduction before any phase-down. */
export const FEDERAL_SALT_LIMIT_2025: Record<FilingStatus, number> = {
  single: 40000,
  mfj: 40000,
};

/** Income level at which the SALT limit begins to phase down. */
export const FEDERAL_SALT_PHASEDOWN_AGI_2025: FilingThresholds = {
  single: 500000,
  mfj: 500000,
};

/** Floor below which the phased-down SALT limit does not fall. */
export const FEDERAL_SALT_MIN_FLOOR_2025: Record<FilingStatus, number> = {
  single: 10000,
  mfj: 10000,
};

/**
 * Income at which the traditional-IRA deduction STARTS to phase out for a
 * contributor covered by a workplace retirement plan.
 *
 * The 2025 ranges are 79,000-89,000 (single) and 126,000-146,000 (MFJ), so
 * start + width (below) must equal 89,000 and 146,000. The previous values
 * (89,000 / 146,000) were the upper ends of those ranges.
 *
 * NOTE(review): the consumer of these constants is not visible here — confirm
 * it computes the phase-out as (income - start) / width.
 */
export const TRADITIONAL_IRA_PHASEOUT_START_2025: Record<FilingStatus, number> = {
  single: 79000,
  mfj: 126000,
};

/** Width of the traditional-IRA deduction phase-out range. */
export const TRADITIONAL_IRA_PHASEOUT_WIDTH_2025: Record<FilingStatus, number> = {
  single: 10000,
  mfj: 20000,
};

/** 2025 IRA contribution limit for contributors under age 50. */
export const TRADITIONAL_IRA_MAX_CONTRIBUTION_UNDER_50_2025 = 7000;
| 'unknown'; + +export interface DocumentInventoryItem { + schemaVersion: string; + id: string; + filePath: string; + fileName: string; + detectedFormType: DocumentType; + issuerOrPayer: string | null; + taxYear: number | null; + pageCount: number | null; + extractionStatus: 'pending' | 'extracted' | 'missing_mock_data' | 'unsupported'; + confidence: 'high' | 'medium' | 'low'; +} + +export interface ExtractedField { + value: T; + confidence: 'high' | 'medium' | 'low'; + sourceFile: string; + sourcePage: number; +} + +export interface ExtractedDocument { + schemaVersion: string; + documentId: string; + documentType: DocumentType; + taxYear: number | null; + payerOrIssuer: string | null; + extractionMethod: 'mock' | 'live-openai' | 'deterministic'; + confidence: 'high' | 'medium' | 'low'; + fields: Record>; +} + +export interface ReviewIssue { + schemaVersion: string; + severity: 'warning' | 'blocking'; + code: string; + message: string; + impactedArea: string; + sourceReferences: string[]; + suggestedNextAction: string; +} + +export interface ReconciliationCheck { + name: string; + status: 'pass' | 'warning' | 'blocking'; + message: string; +} + +export interface ReconciliationReport { + schemaVersion: string; + taxYear: number; + confidence: 'low' | 'medium' | 'high'; + checks: ReconciliationCheck[]; + missingDocumentsLikelyRequired: string[]; + stageDecisionLog: string[]; +} + +export interface ValueWithMeta { + value: number | string | boolean | null; + derivationType: 'copied' | 'normalized' | 'computed' | 'user_provided'; + references: string[]; +} + +export interface EstimateSummary { + schemaVersion: string; + taxYear: number; + generatedAt: string; + inputFingerprint: string; + confidence: 'low' | 'medium' | 'high'; + federalRefundOrAmountOwed: ValueWithMeta; + caRefundOrAmountOwed: ValueWithMeta; + deductionChoice: { + federal: 'unknown' | 'standard' | 'itemized'; + california: 'unknown' | 'standard' | 'itemized'; + }; + optimizations: string[]; + 
// Snippet-level smoke test for the document classifier.
//
// Feeds short text excerpts to `classifyTextSnippet` (loaded from the compiled
// `dist/` output) and checks that each one is classified as the expected form
// type with a confidence other than 'low'.
//
// This script imports from `dist/` and does not run the build itself, so the
// plugin must already be compiled when it is executed.
import assert from 'node:assert/strict';
import path from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';

// Plugin root = parent of the directory containing this script.
// fileURLToPath() is used instead of URL#pathname because `pathname` stays
// percent-encoded (a space becomes "%20") and keeps a leading "/" in front of
// Windows drive letters, both of which produce a path that does not exist.
const root = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');

// import() takes a module specifier / URL rather than an OS path; converting
// to a file: URL keeps absolute Windows paths and special characters working.
const classifierPath = path.join(root, 'dist', 'ingestion', 'classify-document.js');
const { classifyTextSnippet } = await import(pathToFileURL(classifierPath).href);

// Each case: a label for failure messages, the text handed to the classifier,
// and the `detectedFormType` it must report.
const cases = [
  {
    name: 'composite brokerage',
    text: 'FORM 1099 COMPOSITE & YEAR-END SUMMARY Form 1099-DIV Form 1099-INT Form 1099-B',
    expectedType: '1099-composite',
  },
  {
    name: 'property tax bill',
    text: 'City & County of San Francisco Property Tax Bill (Secured) Tax Collector',
    expectedType: 'property-tax-bill',
  },
  {
    name: 'mortgage 1098',
    text: 'Form 1098 Mortgage Interest Statement mortgage interest received by the recipient/lender',
    expectedType: '1098',
  },
  {
    name: 'retirement 1099-r',
    text: 'FORM 1099-R Distributions From Pensions, Annuities, Retirement or Profit-Sharing Plans',
    expectedType: '1099-R',
  },
];

for (const testCase of cases) {
  const result = classifyTextSnippet(testCase.text);
  assert.equal(
    result.detectedFormType,
    testCase.expectedType,
    `${testCase.name}: expected ${testCase.expectedType}, got ${result.detectedFormType}`
  );
  assert.notEqual(result.confidence, 'low', `${testCase.name}: expected non-low confidence`);
}

console.log('crab-tax classify: all snippet checks passed');
b/plugins/tax/test/deterministic.mjs @@ -0,0 +1,161 @@ +import { execFileSync } from 'node:child_process'; +import path from 'node:path'; + +const root = path.resolve(path.dirname(new URL(import.meta.url).pathname), '..'); + +execFileSync('npm', ['run', 'build'], { + cwd: root, + stdio: 'inherit', +}); + +const { + parse1099CompositeText, + parsePropertyTaxBillText, +} = await import(path.join(root, 'dist', 'extraction', 'deterministic.js')); + +const fidelity = parse1099CompositeText( + ` + FIDELITY BROKERAGE SERVICES LLC + Payer's Name and Address: NATIONAL FINANCIAL SERVICES LLC + 2025 TAX REPORTING STATEMENT + 1a Total Ordinary Dividends ........ 418.26 + 1b Qualified Dividends ........ 292.44 + 2a Total Capital Gain Distributions ........ 225.53 + 3 Nondividend Distributions ........ 0.00 + 4 Federal Income Tax Withheld ........ 0.00 + 5 Section 199A Dividends ........ 8.89 + 7 Foreign Tax Paid ........ 18.97 + 12 Exempt Interest Dividends ........ 43.01 + 13 Specified Private Activity Bond Interest Dividends ........ 3.62 + 2025 Interest Income + 1 Interest Income ........ 0.00 + 4 Federal Income Tax Withheld ........ 0.00 + 6 Foreign Tax Paid ........ 0.00 + 8 Tax-Exempt Interest ........ 0.00 + 9 Specified Private Activity Bond Interest ........ 0.00 + Summary of 2025 Proceeds From Broker and Barter Exchange Transactions + 6,744.62 6,784.99 0.00 0.00 -40.37 0.00 + 0.00 0.00 0.00 0.00 0.00 0.00 + 24,785.66 15,468.30 0.00 0.00 9,317.36 0.00 + 0.00 0.00 0.00 0.00 0.00 0.00 + 0.00 0.00 0.00 0.00 0.00 0.00 + 31,530.28 22,253.29 0.00 0.00 9,276.99 0.00 + 1099-B amounts are reported individually to the IRS. 
+ `, + 'fidelity-composite.pdf' +); + +assertEqual(fidelity?.fields.ordinary_dividends, 418.26, 'fidelity ordinary dividends'); +assertEqual(fidelity?.fields.long_term_covered_net_gain_loss, 9317.36, 'fidelity long-term net'); +assertEqual( + fidelity?.fields.has_unsupported_brokerage_rows, + false, + 'fidelity unsupported rows' +); + +const pershing = parse1099CompositeText( + ` + Payer's Information: PERSHING LLC + 2025 1099-DIV + 2025 1099-INT OMB No. 1545-0112 Interest Income Box Amount + $0.00 $0.00 $0.00 $0.00 $0.00 $0.00 $0.00 + Box 1a + Dividends and Distributions OMB No. 1545-0110 Amount + $233.92 $201.16 $0.00 $0.00 $0.00 $0.00 $0.00 $0.00 $1.64 $0.00 $16.80 $0.00 $0.00 $0.00 $0.00 $0.00 $0.00 + Summary of Form 1099-OID + Short-Term Covered Total $491.93 $547.96 ($56.03) + Long-Term Covered Total $809.86 $650.13 $159.73 + TAX LOT DEFAULT DISPOSITION METHOD + `, + 'pershing-composite.pdf' +); + +assertEqual(pershing?.fields.qualified_dividends, 201.16, 'pershing qualified dividends'); +assertEqual(pershing?.fields.short_term_covered_basis, 547.96, 'pershing short basis'); +assertEqual( + pershing?.fields.long_term_covered_net_gain_loss, + 159.73, + 'pershing long net' +); + +const robinhood = parse1099CompositeText( + ` + Robinhood Markets Inc + Enclosed is your 2025 Tax Statement. 
+ 2025 1099-DIV + 71.95 71.95 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 + 2025 1099-MISC + A (basis reported to the IRS) + B (basis not reported to the IRS) 0.00 0.00 0.00 0.00 0.00 + C (Form 1099-B not received) 0.00 0.00 0.00 0.00 0.00 + Total Short-term 0.00 0.00 0.00 0.00 0.00 + D (basis reported to the IRS) + E (basis not reported to the IRS) 0.00 0.00 0.00 0.00 0.00 + F (Form 1099-B not received) 0.00 0.00 0.00 0.00 0.00 + Total Long-term 0.00 0.00 0.00 0.00 0.00 + Total Undetermined-term 0.00 0.00 0.00 0.00 0.00 + Grand total + 2025 1099-INT + 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 + The following amounts are not reported to the IRS + `, + 'robinhood-composite.pdf' +); + +assertEqual(robinhood?.fields.ordinary_dividends, 71.95, 'robinhood ordinary dividends'); +assertEqual( + robinhood?.fields.short_term_covered_net_gain_loss, + 0, + 'robinhood short-term net' +); + +const schwab = parse1099CompositeText( + ` + Charles Schwab & Co., Inc. + TAX YEAR 2025 FORM 1099 COMPOSITE + Detail Information of Dividends and Distributions + META PLATFORMS INC + Total Qualified Dividends (Box 1b and included in Box 1a) + $507.15 + $507.15 + Total Ordinary Dividends (Box 1a) + $507.15 + Detail Information of Interest Income + DEPOSIT INTEREST + Total Interest Income (Included in Box 1) + $0.35 + Total Interest Income (Box 1) + $0.35 + Terms and Conditions + `, + 'schwab-composite.pdf' +); + +assertEqual(schwab?.fields.ordinary_dividends, 507.15, 'schwab ordinary dividends'); +assertEqual(schwab?.fields.qualified_dividends, 507.15, 'schwab qualified dividends'); +assertEqual(schwab?.fields.interest_income, 0.35, 'schwab interest income'); + +const propertyTaxBill = parsePropertyTaxBillText(` + City & County of San Francisco + Property Tax Bill (Secured) + For Fiscal Year July 1, 2025 through June 30, 2026 + Tax Amount $13,280.44 + Total Direct Charges and Special Assessments $875.48 + 2nd Installment Due Pay by April 10, 2026 $7,077.96 + 
// End-to-end fixture tests for the crab-tax CLI.
//
// Builds the plugin once, then for every fixture under `testdata/`:
//   1. copies the fixture into a fresh temporary directory,
//   2. runs `dist/cli.js <dir> --output <dir>/out --profile <dir>/profile.json`
//      with the OpenAI-related environment variables blanked,
//   3. checks the process exit code and the JSON files written to the output
//      directory.
// Any failed expectation throws, which aborts the script.
import { execFileSync } from 'node:child_process';
import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

// Plugin root = parent of the directory containing this script.
// fileURLToPath() is used instead of URL#pathname because `pathname` stays
// percent-encoded and is malformed on Windows, which breaks checkouts whose
// path contains spaces or non-ASCII characters.
const root = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
const distCli = path.join(root, 'dist', 'cli.js');

// Each case names a fixture directory, the exit code the CLI must return for
// it, and an `assertions(outputDir)` callback that inspects the written files.
const cases = [
  {
    name: 'basic',
    fixture: path.join(root, 'testdata', 'e2e-basic'),
    expectedExitCode: 0,
    assertions(outputDir) {
      const estimate = readJson(path.join(outputDir, 'estimate_summary.json'));
      if (estimate.blockingIssueCount !== 0) {
        throw new Error(`basic: expected no blocking issues, got ${estimate.blockingIssueCount}`);
      }
      const documents = readJson(path.join(outputDir, 'documents.json'));
      if (documents.length !== 3) {
        throw new Error(`basic: expected 3 documents, got ${documents.length}`);
      }
    },
  },
  {
    name: 'extended',
    fixture: path.join(root, 'testdata', 'e2e-extended'),
    expectedExitCode: 0,
    assertions(outputDir) {
      const federal = readJson(path.join(outputDir, 'federal_return_inputs.json'));
      if (federal.deduction_choice !== 'itemized') {
        throw new Error(`extended: expected federal itemized deduction, got ${federal.deduction_choice}`);
      }
      if (federal.short_term_covered_net_gain_loss_preview.value !== 720) {
        throw new Error('extended: expected short-term covered preview gain/loss of 720');
      }
    },
  },
  {
    name: 'retirement',
    fixture: path.join(root, 'testdata', 'e2e-retirement'),
    expectedExitCode: 0,
    assertions(outputDir) {
      const federal = readJson(path.join(outputDir, 'federal_return_inputs.json'));
      if (federal.traditional_ira_deduction.value !== 7000) {
        throw new Error('retirement: expected traditional IRA deduction of 7000');
      }
      if (federal.retirement_taxable_amount.value !== 5000) {
        throw new Error('retirement: expected taxable retirement amount of 5000');
      }
    },
  },
  {
    name: 'blocking-unsupported',
    fixture: path.join(root, 'testdata', 'e2e-blocking-unsupported'),
    expectedExitCode: 1,
    assertions(outputDir) {
      const estimate = readJson(path.join(outputDir, 'estimate_summary.json'));
      if (estimate.blockingIssueCount < 1) {
        throw new Error('blocking-unsupported: expected at least one blocking issue');
      }
      if (estimate.confidence !== 'low') {
        throw new Error(`blocking-unsupported: expected low confidence, got ${estimate.confidence}`);
      }
      const documents = readJson(path.join(outputDir, 'documents.json'));
      const composite = documents.find((document) => document.fileName === 'brokerage-1099-composite-2025.pdf');
      if (!composite) {
        throw new Error('blocking-unsupported: expected composite brokerage document in inventory');
      }
      if (composite.detectedFormType !== '1099-composite') {
        throw new Error(
          `blocking-unsupported: expected detectedFormType 1099-composite, got ${composite.detectedFormType}`
        );
      }
      const issues = readJson(path.join(outputDir, 'issues_to_review.json'));
      if (!issues.some((issue) => issue.severity === 'blocking')) {
        throw new Error('blocking-unsupported: expected at least one blocking issue entry');
      }
    },
  },
  {
    name: 'tax-year-mismatch',
    fixture: path.join(root, 'testdata', 'e2e-tax-year-mismatch'),
    expectedExitCode: 1,
    assertions(outputDir) {
      const estimate = readJson(path.join(outputDir, 'estimate_summary.json'));
      if (estimate.blockingIssueCount < 1) {
        throw new Error('tax-year-mismatch: expected at least one blocking issue');
      }
      const issues = readJson(path.join(outputDir, 'issues_to_review.json'));
      if (!issues.some((issue) => issue.code === 'DOCUMENT_TAX_YEAR_MISMATCH')) {
        throw new Error('tax-year-mismatch: expected tax-year mismatch issue');
      }
    },
  },
];

// Compile once up front so every case runs against a fresh dist/cli.js.
execFileSync('npm', ['run', 'build'], {
  cwd: root,
  stdio: 'inherit',
});

for (const testCase of cases) {
  const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), `crab-tax-${testCase.name}-`));
  try {
    copyDir(testCase.fixture, tempDir);
    const profilePath = path.join(tempDir, 'profile.json');
    const outputDir = path.join(tempDir, 'out');

    const exitCode = runCli([tempDir, '--output', outputDir, '--profile', profilePath]);
    if (exitCode !== testCase.expectedExitCode) {
      throw new Error(
        `${testCase.name}: expected exit code ${testCase.expectedExitCode}, got ${exitCode}`
      );
    }

    testCase.assertions(outputDir);
  } finally {
    // Remove the per-case scratch directory whether the case passed or threw,
    // so repeated runs do not pile up copies of the fixtures in the OS tmpdir.
    fs.rmSync(tempDir, { recursive: true, force: true });
  }
}

console.log('crab-tax e2e: all fixture checks passed');

/**
 * Runs the compiled CLI and returns its exit code.
 *
 * The OpenAI-related variables are blanked so the run cannot pick them up from
 * the developer's environment. The CLI is started with the same Node binary
 * that is running this script rather than whichever `node` is first on PATH.
 *
 * @param {string[]} args Arguments passed to dist/cli.js.
 * @returns {number} 0 on success, otherwise the CLI's non-zero exit status.
 * @throws Re-throws the spawn error when the process produced no numeric exit
 *   status (it could not be started or was killed by a signal), so such a
 *   failure is not mistaken for the CLI's own exit code 1.
 */
function runCli(args) {
  try {
    execFileSync(process.execPath, [distCli, ...args], {
      cwd: root,
      env: {
        ...process.env,
        OPENAI_API_KEY: '',
        CRAB_TAX_MODEL: '',
        CRAB_TAX_AGENT_MODEL: '',
      },
      stdio: 'inherit',
    });
    return 0;
  } catch (error) {
    if (typeof error.status !== 'number') {
      throw error;
    }
    return error.status;
  }
}

/**
 * Recursively copies the contents of `source` into `destination`, creating
 * `destination` (and any nested directories) as needed.
 *
 * @param {string} source Directory to copy from.
 * @param {string} destination Directory to copy into.
 */
function copyDir(source, destination) {
  fs.mkdirSync(destination, { recursive: true });
  for (const entry of fs.readdirSync(source, { withFileTypes: true })) {
    const from = path.join(source, entry.name);
    const to = path.join(destination, entry.name);
    if (entry.isDirectory()) {
      copyDir(from, to);
    } else {
      fs.copyFileSync(from, to);
    }
  }
}

/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * @param {string} filePath Path of the JSON file.
 * @returns {any} The parsed value.
 */
function readJson(filePath) {
  return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
}
diff --git a/plugins/tax/testdata/e2e-basic/chase-1099-int-2025.pdf b/plugins/tax/testdata/e2e-basic/chase-1099-int-2025.pdf new file mode 100644 index 0000000..4cd7cc6 Binary files /dev/null and b/plugins/tax/testdata/e2e-basic/chase-1099-int-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-basic/chase-1099-int-2025.pdf.mock.json b/plugins/tax/testdata/e2e-basic/chase-1099-int-2025.pdf.mock.json new file mode 100644 index 0000000..e762c2d --- /dev/null +++ b/plugins/tax/testdata/e2e-basic/chase-1099-int-2025.pdf.mock.json @@ -0,0 +1,15 @@ +{ + "payerOrIssuer": "Chase Bank", + "taxYear": 2025, + "confidence": "high", + "fields": { + "interest_income": { + "value": 412.34, + "sourcePage": 1 + }, + "federal_withholding": { + "value": 0, + "sourcePage": 1 + } + } +} diff --git a/plugins/tax/testdata/e2e-basic/fidelity-1099-div-2025.pdf b/plugins/tax/testdata/e2e-basic/fidelity-1099-div-2025.pdf new file mode 100644 index 0000000..ee07d8d Binary files /dev/null and b/plugins/tax/testdata/e2e-basic/fidelity-1099-div-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-basic/fidelity-1099-div-2025.pdf.mock.json b/plugins/tax/testdata/e2e-basic/fidelity-1099-div-2025.pdf.mock.json new file mode 100644 index 0000000..e6eff28 --- /dev/null +++ b/plugins/tax/testdata/e2e-basic/fidelity-1099-div-2025.pdf.mock.json @@ -0,0 +1,19 @@ +{ + "payerOrIssuer": "Fidelity", + "taxYear": 2025, + "confidence": "high", + "fields": { + "ordinary_dividends": { + "value": 1840.12, + "sourcePage": 1 + }, + "qualified_dividends": { + "value": 1210.55, + "sourcePage": 1 + }, + "capital_gain_distributions": { + "value": 50, + "sourcePage": 1 + } + } +} diff --git a/plugins/tax/testdata/e2e-basic/profile.json b/plugins/tax/testdata/e2e-basic/profile.json new file mode 100644 index 0000000..6993311 --- /dev/null +++ b/plugins/tax/testdata/e2e-basic/profile.json @@ -0,0 +1,16 @@ +{ + "schemaVersion": "0.1.0", + "taxYear": 2025, + "filingStatus": "single", + "state": "CA", + 
"fullYearResident": true, + "dependents": 0, + "estimatedPayments": [], + "iraContributions": [], + "scenarioFlags": { + "rsu": false, + "espp": false, + "inheritedShares": false + }, + "reviewAnswers": {} +} diff --git a/plugins/tax/testdata/e2e-basic/w2-2025.pdf b/plugins/tax/testdata/e2e-basic/w2-2025.pdf new file mode 100644 index 0000000..bc6fdb3 Binary files /dev/null and b/plugins/tax/testdata/e2e-basic/w2-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-basic/w2-2025.pdf.mock.json b/plugins/tax/testdata/e2e-basic/w2-2025.pdf.mock.json new file mode 100644 index 0000000..d1e70e1 --- /dev/null +++ b/plugins/tax/testdata/e2e-basic/w2-2025.pdf.mock.json @@ -0,0 +1,27 @@ +{ + "payerOrIssuer": "Example Employer Inc.", + "taxYear": 2025, + "confidence": "high", + "fields": { + "box1_wages": { + "value": 120000, + "sourcePage": 1 + }, + "box2_federal_withholding": { + "value": 18000, + "sourcePage": 1 + }, + "box12_code_d": { + "value": 23000, + "sourcePage": 1 + }, + "state_ca_wages": { + "value": 120000, + "sourcePage": 1 + }, + "state_ca_withholding": { + "value": 8200, + "sourcePage": 1 + } + } +} diff --git a/plugins/tax/testdata/e2e-blocking-unsupported/README.md b/plugins/tax/testdata/e2e-blocking-unsupported/README.md new file mode 100644 index 0000000..20fcb9d --- /dev/null +++ b/plugins/tax/testdata/e2e-blocking-unsupported/README.md @@ -0,0 +1 @@ +Fixture that proves material unsupported documents block the run. 
diff --git a/plugins/tax/testdata/e2e-blocking-unsupported/brokerage-1099-composite-2025.pdf b/plugins/tax/testdata/e2e-blocking-unsupported/brokerage-1099-composite-2025.pdf new file mode 100644 index 0000000..e699867 Binary files /dev/null and b/plugins/tax/testdata/e2e-blocking-unsupported/brokerage-1099-composite-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-blocking-unsupported/profile.json b/plugins/tax/testdata/e2e-blocking-unsupported/profile.json new file mode 100644 index 0000000..6993311 --- /dev/null +++ b/plugins/tax/testdata/e2e-blocking-unsupported/profile.json @@ -0,0 +1,16 @@ +{ + "schemaVersion": "0.1.0", + "taxYear": 2025, + "filingStatus": "single", + "state": "CA", + "fullYearResident": true, + "dependents": 0, + "estimatedPayments": [], + "iraContributions": [], + "scenarioFlags": { + "rsu": false, + "espp": false, + "inheritedShares": false + }, + "reviewAnswers": {} +} diff --git a/plugins/tax/testdata/e2e-blocking-unsupported/w2-2025.pdf b/plugins/tax/testdata/e2e-blocking-unsupported/w2-2025.pdf new file mode 100644 index 0000000..a204a14 Binary files /dev/null and b/plugins/tax/testdata/e2e-blocking-unsupported/w2-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-blocking-unsupported/w2-2025.pdf.mock.json b/plugins/tax/testdata/e2e-blocking-unsupported/w2-2025.pdf.mock.json new file mode 100644 index 0000000..d1e70e1 --- /dev/null +++ b/plugins/tax/testdata/e2e-blocking-unsupported/w2-2025.pdf.mock.json @@ -0,0 +1,27 @@ +{ + "payerOrIssuer": "Example Employer Inc.", + "taxYear": 2025, + "confidence": "high", + "fields": { + "box1_wages": { + "value": 120000, + "sourcePage": 1 + }, + "box2_federal_withholding": { + "value": 18000, + "sourcePage": 1 + }, + "box12_code_d": { + "value": 23000, + "sourcePage": 1 + }, + "state_ca_wages": { + "value": 120000, + "sourcePage": 1 + }, + "state_ca_withholding": { + "value": 8200, + "sourcePage": 1 + } + } +} diff --git a/plugins/tax/testdata/e2e-extended/README.md 
b/plugins/tax/testdata/e2e-extended/README.md new file mode 100644 index 0000000..f11ecd2 --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/README.md @@ -0,0 +1,8 @@ +Fixture for extended mock extraction and preview aggregation tests. + +Includes: +- W-2 +- 1099-INT +- 1099-DIV +- 1098 +- 1099-B diff --git a/plugins/tax/testdata/e2e-extended/chase-1099-int-2025.pdf b/plugins/tax/testdata/e2e-extended/chase-1099-int-2025.pdf new file mode 100644 index 0000000..4cd7cc6 Binary files /dev/null and b/plugins/tax/testdata/e2e-extended/chase-1099-int-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-extended/chase-1099-int-2025.pdf.mock.json b/plugins/tax/testdata/e2e-extended/chase-1099-int-2025.pdf.mock.json new file mode 100644 index 0000000..8777c12 --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/chase-1099-int-2025.pdf.mock.json @@ -0,0 +1,9 @@ +{ + "payerOrIssuer": "Chase Bank", + "taxYear": 2025, + "confidence": "high", + "fields": { + "interest_income": { "value": 412.34, "sourcePage": 1 }, + "federal_withholding": { "value": 0, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-extended/fidelity-1099-b-2025.pdf b/plugins/tax/testdata/e2e-extended/fidelity-1099-b-2025.pdf new file mode 100644 index 0000000..fb76961 Binary files /dev/null and b/plugins/tax/testdata/e2e-extended/fidelity-1099-b-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-extended/fidelity-1099-b-2025.pdf.mock.json b/plugins/tax/testdata/e2e-extended/fidelity-1099-b-2025.pdf.mock.json new file mode 100644 index 0000000..964cb1a --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/fidelity-1099-b-2025.pdf.mock.json @@ -0,0 +1,13 @@ +{ + "payerOrIssuer": "Fidelity", + "taxYear": 2025, + "confidence": "high", + "fields": { + "short_term_covered_proceeds": { "value": 18500, "sourcePage": 1 }, + "short_term_covered_basis": { "value": 17900, "sourcePage": 1 }, + "short_term_wash_sale_adjustments": { "value": 120, "sourcePage": 1 }, + "long_term_covered_proceeds": { 
"value": 12000, "sourcePage": 1 }, + "long_term_covered_basis": { "value": 9400, "sourcePage": 1 }, + "long_term_wash_sale_adjustments": { "value": 0, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-extended/fidelity-1099-div-2025.pdf b/plugins/tax/testdata/e2e-extended/fidelity-1099-div-2025.pdf new file mode 100644 index 0000000..ee07d8d Binary files /dev/null and b/plugins/tax/testdata/e2e-extended/fidelity-1099-div-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-extended/fidelity-1099-div-2025.pdf.mock.json b/plugins/tax/testdata/e2e-extended/fidelity-1099-div-2025.pdf.mock.json new file mode 100644 index 0000000..8fdebb9 --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/fidelity-1099-div-2025.pdf.mock.json @@ -0,0 +1,10 @@ +{ + "payerOrIssuer": "Fidelity", + "taxYear": 2025, + "confidence": "high", + "fields": { + "ordinary_dividends": { "value": 1840.12, "sourcePage": 1 }, + "qualified_dividends": { "value": 1210.55, "sourcePage": 1 }, + "capital_gain_distributions": { "value": 50, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-extended/mortgage-1098-2025.pdf b/plugins/tax/testdata/e2e-extended/mortgage-1098-2025.pdf new file mode 100644 index 0000000..2418e21 Binary files /dev/null and b/plugins/tax/testdata/e2e-extended/mortgage-1098-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-extended/mortgage-1098-2025.pdf.mock.json b/plugins/tax/testdata/e2e-extended/mortgage-1098-2025.pdf.mock.json new file mode 100644 index 0000000..0ce7e15 --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/mortgage-1098-2025.pdf.mock.json @@ -0,0 +1,10 @@ +{ + "payerOrIssuer": "Home Lender", + "taxYear": 2025, + "confidence": "high", + "fields": { + "mortgage_interest_received": { "value": 14300, "sourcePage": 1 }, + "property_tax_paid": { "value": 6800, "sourcePage": 1 }, + "points_paid": { "value": 0, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-extended/profile.json 
b/plugins/tax/testdata/e2e-extended/profile.json new file mode 100644 index 0000000..6993311 --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/profile.json @@ -0,0 +1,16 @@ +{ + "schemaVersion": "0.1.0", + "taxYear": 2025, + "filingStatus": "single", + "state": "CA", + "fullYearResident": true, + "dependents": 0, + "estimatedPayments": [], + "iraContributions": [], + "scenarioFlags": { + "rsu": false, + "espp": false, + "inheritedShares": false + }, + "reviewAnswers": {} +} diff --git a/plugins/tax/testdata/e2e-extended/w2-2025.pdf b/plugins/tax/testdata/e2e-extended/w2-2025.pdf new file mode 100644 index 0000000..bc6fdb3 Binary files /dev/null and b/plugins/tax/testdata/e2e-extended/w2-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-extended/w2-2025.pdf.mock.json b/plugins/tax/testdata/e2e-extended/w2-2025.pdf.mock.json new file mode 100644 index 0000000..4851fb8 --- /dev/null +++ b/plugins/tax/testdata/e2e-extended/w2-2025.pdf.mock.json @@ -0,0 +1,11 @@ +{ + "payerOrIssuer": "Example Employer Inc.", + "taxYear": 2025, + "confidence": "high", + "fields": { + "box1_wages": { "value": 120000, "sourcePage": 1 }, + "box2_federal_withholding": { "value": 18000, "sourcePage": 1 }, + "state_ca_wages": { "value": 120000, "sourcePage": 1 }, + "state_ca_withholding": { "value": 8200, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-retirement/README.md b/plugins/tax/testdata/e2e-retirement/README.md new file mode 100644 index 0000000..fb30b29 --- /dev/null +++ b/plugins/tax/testdata/e2e-retirement/README.md @@ -0,0 +1,6 @@ +Fixture for retirement-focused end-to-end tests. 
+ +Includes: +- W-2 without workplace retirement coverage +- 1099-R taxable distribution +- 5498 traditional IRA contribution diff --git a/plugins/tax/testdata/e2e-retirement/ira-1099-r-2025.pdf b/plugins/tax/testdata/e2e-retirement/ira-1099-r-2025.pdf new file mode 100644 index 0000000..fd9e3f2 Binary files /dev/null and b/plugins/tax/testdata/e2e-retirement/ira-1099-r-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-retirement/ira-1099-r-2025.pdf.mock.json b/plugins/tax/testdata/e2e-retirement/ira-1099-r-2025.pdf.mock.json new file mode 100644 index 0000000..5a315f1 --- /dev/null +++ b/plugins/tax/testdata/e2e-retirement/ira-1099-r-2025.pdf.mock.json @@ -0,0 +1,12 @@ +{ + "payerOrIssuer": "Example Brokerage", + "taxYear": 2025, + "confidence": "high", + "fields": { + "gross_distribution": { "value": 5000, "sourcePage": 1 }, + "taxable_amount": { "value": 5000, "sourcePage": 1 }, + "federal_withholding": { "value": 500, "sourcePage": 1 }, + "state_withholding": { "value": 200, "sourcePage": 1 }, + "distribution_code": { "value": "7", "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-retirement/ira-5498-2025.pdf b/plugins/tax/testdata/e2e-retirement/ira-5498-2025.pdf new file mode 100644 index 0000000..983dbad Binary files /dev/null and b/plugins/tax/testdata/e2e-retirement/ira-5498-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-retirement/ira-5498-2025.pdf.mock.json b/plugins/tax/testdata/e2e-retirement/ira-5498-2025.pdf.mock.json new file mode 100644 index 0000000..90f6b13 --- /dev/null +++ b/plugins/tax/testdata/e2e-retirement/ira-5498-2025.pdf.mock.json @@ -0,0 +1,9 @@ +{ + "payerOrIssuer": "Example Brokerage", + "taxYear": 2025, + "confidence": "high", + "fields": { + "account_type": { "value": "traditional_ira", "sourcePage": 1 }, + "contribution_amount": { "value": 7000, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-retirement/profile.json b/plugins/tax/testdata/e2e-retirement/profile.json new file mode 100644 index 
0000000..6993311 --- /dev/null +++ b/plugins/tax/testdata/e2e-retirement/profile.json @@ -0,0 +1,16 @@ +{ + "schemaVersion": "0.1.0", + "taxYear": 2025, + "filingStatus": "single", + "state": "CA", + "fullYearResident": true, + "dependents": 0, + "estimatedPayments": [], + "iraContributions": [], + "scenarioFlags": { + "rsu": false, + "espp": false, + "inheritedShares": false + }, + "reviewAnswers": {} +} diff --git a/plugins/tax/testdata/e2e-retirement/w2-2025.pdf b/plugins/tax/testdata/e2e-retirement/w2-2025.pdf new file mode 100644 index 0000000..bc6fdb3 Binary files /dev/null and b/plugins/tax/testdata/e2e-retirement/w2-2025.pdf differ diff --git a/plugins/tax/testdata/e2e-retirement/w2-2025.pdf.mock.json b/plugins/tax/testdata/e2e-retirement/w2-2025.pdf.mock.json new file mode 100644 index 0000000..6a01b97 --- /dev/null +++ b/plugins/tax/testdata/e2e-retirement/w2-2025.pdf.mock.json @@ -0,0 +1,11 @@ +{ + "payerOrIssuer": "Example Employer Inc.", + "taxYear": 2025, + "confidence": "high", + "fields": { + "box1_wages": { "value": 90000, "sourcePage": 1 }, + "box2_federal_withholding": { "value": 12000, "sourcePage": 1 }, + "state_ca_wages": { "value": 90000, "sourcePage": 1 }, + "state_ca_withholding": { "value": 5200, "sourcePage": 1 } + } +} diff --git a/plugins/tax/testdata/e2e-tax-year-mismatch/README.md b/plugins/tax/testdata/e2e-tax-year-mismatch/README.md new file mode 100644 index 0000000..a7d5804 --- /dev/null +++ b/plugins/tax/testdata/e2e-tax-year-mismatch/README.md @@ -0,0 +1 @@ +Fixture that proves extracted tax-year mismatches block the run. 
diff --git a/plugins/tax/testdata/e2e-tax-year-mismatch/chase-1099-int-2025.pdf b/plugins/tax/testdata/e2e-tax-year-mismatch/chase-1099-int-2025.pdf
new file mode 100644
index 0000000..eabad09
Binary files /dev/null and b/plugins/tax/testdata/e2e-tax-year-mismatch/chase-1099-int-2025.pdf differ
diff --git a/plugins/tax/testdata/e2e-tax-year-mismatch/chase-1099-int-2025.pdf.mock.json b/plugins/tax/testdata/e2e-tax-year-mismatch/chase-1099-int-2025.pdf.mock.json
new file mode 100644
index 0000000..de09ed4
--- /dev/null
+++ b/plugins/tax/testdata/e2e-tax-year-mismatch/chase-1099-int-2025.pdf.mock.json
@@ -0,0 +1,15 @@
+{
+  "payerOrIssuer": "Chase Bank",
+  "taxYear": 2024,
+  "confidence": "high",
+  "fields": {
+    "interest_income": {
+      "value": 450,
+      "sourcePage": 1
+    },
+    "federal_withholding": {
+      "value": 0,
+      "sourcePage": 1
+    }
+  }
+}
diff --git a/plugins/tax/testdata/e2e-tax-year-mismatch/profile.json b/plugins/tax/testdata/e2e-tax-year-mismatch/profile.json
new file mode 100644
index 0000000..6993311
--- /dev/null
+++ b/plugins/tax/testdata/e2e-tax-year-mismatch/profile.json
@@ -0,0 +1,16 @@
+{
+  "schemaVersion": "0.1.0",
+  "taxYear": 2025,
+  "filingStatus": "single",
+  "state": "CA",
+  "fullYearResident": true,
+  "dependents": 0,
+  "estimatedPayments": [],
+  "iraContributions": [],
+  "scenarioFlags": {
+    "rsu": false,
+    "espp": false,
+    "inheritedShares": false
+  },
+  "reviewAnswers": {}
+}
diff --git a/plugins/tax/testdata/e2e-tax-year-mismatch/w2-2025.pdf b/plugins/tax/testdata/e2e-tax-year-mismatch/w2-2025.pdf
new file mode 100644
index 0000000..a204a14
Binary files /dev/null and b/plugins/tax/testdata/e2e-tax-year-mismatch/w2-2025.pdf differ
diff --git a/plugins/tax/testdata/e2e-tax-year-mismatch/w2-2025.pdf.mock.json b/plugins/tax/testdata/e2e-tax-year-mismatch/w2-2025.pdf.mock.json
new file mode 100644
index 0000000..d1e70e1
--- /dev/null
+++ b/plugins/tax/testdata/e2e-tax-year-mismatch/w2-2025.pdf.mock.json
@@ -0,0 +1,27 @@
+{
+  "payerOrIssuer": "Example Employer Inc.",
+  "taxYear": 2025,
+  "confidence": "high",
+  "fields": {
+    "box1_wages": {
+      "value": 120000,
+      "sourcePage": 1
+    },
+    "box2_federal_withholding": {
+      "value": 18000,
+      "sourcePage": 1
+    },
+    "box12_code_d": {
+      "value": 23000,
+      "sourcePage": 1
+    },
+    "state_ca_wages": {
+      "value": 120000,
+      "sourcePage": 1
+    },
+    "state_ca_withholding": {
+      "value": 8200,
+      "sourcePage": 1
+    }
+  }
+}
diff --git a/plugins/tax/tsconfig.json b/plugins/tax/tsconfig.json
new file mode 100644
index 0000000..7b652fa
--- /dev/null
+++ b/plugins/tax/tsconfig.json
@@ -0,0 +1,17 @@
+{
+  "compilerOptions": {
+    "target": "ES2022",
+    "module": "ESNext",
+    "moduleResolution": "bundler",
+    "lib": ["ES2022"],
+    "outDir": "dist",
+    "rootDir": "src",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "declaration": true,
+    "resolveJsonModule": true
+  },
+  "include": ["src/**/*"],
+  "exclude": ["node_modules", "dist"]
+}
diff --git a/src/crabcode b/src/crabcode
index 37d5305..ccac8b6 100755
--- a/src/crabcode
+++ b/src/crabcode
@@ -7576,6 +7576,161 @@ pf_help() {
   echo " ANTHROPIC_API_KEY Anthropic API key"
 }
 
+# =============================================================================
+# Tax Plugin Commands
+# =============================================================================
+
+# Entry point for `crab tax`: runs the plugin-management subcommands and
+# forwards every other invocation to the installed plugin CLI.
+# Globals:   CONFIG_DIR, CYAN, GREEN, NC (read)
+# Arguments: $@ - subcommand, or arguments passed unchanged to dist/cli.js
+# Returns:   1 if the plugin is not installed; otherwise the status of the
+#            selected subcommand or of the plugin CLI
+handle_tax_command() {
+  local PLUGIN_DIR="$CONFIG_DIR/plugins/tax"
+
+  case "${1:-}" in
+    "install")
+      tax_install
+      ;;
+    "uninstall"|"remove")
+      tax_uninstall
+      ;;
+    "update"|"upgrade")
+      # tax_install swaps the new build in only after it succeeds, so there
+      # is no uninstall step here: a failed update keeps the current plugin.
+      tax_install
+      ;;
+    "help"|"-h"|"--help"|"")
+      tax_help
+      ;;
+    *)
+      if [ ! -d "$PLUGIN_DIR" ] || [ ! -f "$PLUGIN_DIR/dist/cli.js" ]; then
+        echo -e "${CYAN}Crab Tax - Tax document organizer${NC}"
+        echo ""
+        echo "Plugin not installed. Install with:"
+        echo ""
+        echo -e "  ${GREEN}crab tax install${NC}"
+        echo ""
+        return 1
+      fi
+      node "$PLUGIN_DIR/dist/cli.js" "$@"
+      ;;
+  esac
+}
+
+# Downloads the tax plugin from the crabcode repository, builds it in a
+# temporary directory, and moves the finished build into place.
+# Globals:   CONFIG_DIR, CYAN, GREEN, NC (read)
+# Returns:   0 on success; 1 if Node.js is missing or any step fails. A
+#            failed download or build leaves an existing installation intact.
+tax_install() {
+  local PLUGIN_DIR="$CONFIG_DIR/plugins/tax"
+  local REPO_URL="https://github.com/promptfoo/crabcode.git"
+  local BRANCH="main"
+  local TEMP_DIR
+
+  echo -e "${CYAN}Installing Crab Tax...${NC}"
+
+  if ! command_exists node; then
+    error "Node.js is required. Install with: brew install node"
+    return 1
+  fi
+
+  if ! mkdir -p "$CONFIG_DIR/plugins"; then
+    error "Could not create $CONFIG_DIR/plugins"
+    return 1
+  fi
+
+  # Assigned separately from `local` so that a mktemp failure is not masked
+  # by the exit status of the declaration.
+  if ! TEMP_DIR=$(mktemp -d); then
+    error "Could not create a temporary directory"
+    return 1
+  fi
+
+  echo "Cloning plugin from $REPO_URL..."
+
+  # Clone and build inside a subshell: the caller's working directory is
+  # never changed, and the && chain stops at the first failing step.
+  # --quiet hides clone progress but still lets git report errors.
+  if ! (
+    git clone --quiet --depth 1 --branch "$BRANCH" --filter=blob:none \
+      --sparse "$REPO_URL" "$TEMP_DIR" &&
+      cd "$TEMP_DIR" &&
+      git sparse-checkout set plugins/tax &&
+      cd plugins/tax &&
+      echo "Installing dependencies..." &&
+      npm install --silent &&
+      npm run build --silent
+  ); then
+    rm -rf "$TEMP_DIR"
+    error "Failed to download or build the tax plugin"
+    return 1
+  fi
+
+  # Replace any previous installation only now that the new build exists.
+  rm -rf "$PLUGIN_DIR"
+  if ! mv "$TEMP_DIR/plugins/tax" "$PLUGIN_DIR"; then
+    rm -rf "$TEMP_DIR"
+    error "Could not move the plugin into $PLUGIN_DIR"
+    return 1
+  fi
+  rm -rf "$TEMP_DIR"
+
+  success "✓ Installed to $PLUGIN_DIR"
+  echo ""
+  echo "Usage:"
+  echo -e "  ${GREEN}crab tax ./my-tax-docs${NC}                  # Inventory tax documents"
+  echo -e "  ${GREEN}crab tax ./my-tax-docs --output ./out${NC}   # Write outputs to a directory"
+  echo -e "  ${GREEN}crab tax --help${NC}                         # Show all options"
+}
+
+# Removes the installed tax plugin, if present.
+# Globals:   CONFIG_DIR (read)
+# Returns:   0 if the plugin was removed or was not installed; 1 if the
+#            plugin directory could not be removed
+tax_uninstall() {
+  local PLUGIN_DIR="$CONFIG_DIR/plugins/tax"
+
+  if [ ! -d "$PLUGIN_DIR" ]; then
+    echo "Plugin not installed"
+    return 0
+  fi
+
+  if ! rm -rf "$PLUGIN_DIR"; then
+    error "Could not remove $PLUGIN_DIR"
+    return 1
+  fi
+  success "✓ Uninstalled tax plugin"
+}
+
+# Prints usage information for `crab tax`.
+# Globals:   CYAN, BOLD, NC (read)
+tax_help() {
+  echo -e "${CYAN}Crab Tax - Tax document organizer${NC}"
+  echo ""
+  echo "Organize tax documents and generate filing handoff outputs."
+  echo ""
+  echo -e "${BOLD}Installation:${NC}"
+  echo "  crab tax install      Install the plugin"
+  echo "  crab tax uninstall    Remove the plugin"
+  echo "  crab tax update       Update to latest version"
+  echo ""
+  echo -e "${BOLD}Usage:${NC}"
+  echo "  crab tax <folder>     Process a folder of tax documents"
+  echo "  crab tax <folder> --output ./out"
+  echo "  crab tax <folder> --profile ./profile.json"
+  echo ""
+  echo -e "${BOLD}Options:${NC}"
+  echo "  --output              Output directory"
+  echo "  --profile             Taxpayer profile JSON"
+  echo "  --tax-year            Tax year (default: 2025)"
+  echo "  --filing-status       Filing status"
+  echo "  --preview             Allow preview outputs"
+  echo "  --verbose             Show additional output"
+}
+
 # =============================================================================
 # Draw Plugin Commands
 # =============================================================================
@@ -9617,6 +9772,10 @@ main() {
       # Promptfoo target discovery agent
       handle_pf_command "${@:2}"
       ;;
+    "tax")
+      # Tax document organizer plugin
+      handle_tax_command "${@:2}"
+      ;;
     "draw")
       # Collaborative Excalidraw sessions
       handle_draw_command "${@:2}"