microsoft · gargsaumya · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -48,6 +48,16 @@ build/
 
 # wheel files
 *.whl
+
+# Coverage reports and artifacts
+.coverage
+coverage.json
+coverage*.xml
+htmlcov/
+unified-coverage/
+*.profraw
+*.profdata
+*.info
 *.tar.gz
 *.zip
 
@@ -66,3 +76,11 @@ mssql_py_core/
 
 # learning files
 learnings/
+
+# Local development and experimental scripts (intentionally untracked)
+add_platform_exclusions.py
+add_lcov_exclusions.py
+fix_multiline_log_exclusions.py
+test_pyodbc_decimal.py
+run_coverage_docker.ps1
+TRIAGE_REPORT_*.md
diff --git a/eng/scripts/join_logs_for_coverage.py b/eng/scripts/join_logs_for_coverage.py
@@ -0,0 +1,197 @@
+#!/usr/bin/env python3
+"""
+Join multi-line LOG() calls onto single lines for LCOV coverage filtering.
+
+This script is used only during coverage builds to simplify LOG statement exclusion.
+It rewrites the C++ files in place, so the caller must back them up first and restore them afterwards.
+Adjacent string literals are concatenated at compile time, so runtime behavior is identical.
+
+Uses a proper C++ tokenizer to handle string literals, character literals, and comments
+correctly, avoiding issues with unbalanced parentheses or semicolons in strings.
+"""
+
+import re
+import sys
+from pathlib import Path
+
+
+# Start of a LOG-style macro call: word boundary, "LOG", an optional
+# upper-case/underscore suffix, optional whitespace, then the opening parenthesis.
+_LOG_MACRO_PATTERN = re.compile(r'\bLOG[A-Z_]*\s*\(')
+
+
+def _find_log_macro_open(line: str):
+    """Locate the opening parenthesis of the first LOG-like macro call in ``line``.
+
+    Returns the zero-based index of that ``(``, or ``None`` when the line
+    contains no LOG-like macro call.
+    """
+    found = _LOG_MACRO_PATTERN.search(line)
+    # The match ends just past the parenthesis, hence the ``- 1``.
+    return found.end() - 1 if found else None
+
+
+def _is_digit_separator(line, index):
+    """Return True when the apostrophe at ``line[index]`` is a C++14 digit separator.
+
+    A separator sits inside a numeric literal (``1'000'000``, ``0xFF'FF``): the
+    token it belongs to starts with a digit and a (hex) digit follows it.
+    Prefixed character literals such as ``u8'a'`` start with a letter and are
+    therefore not mistaken for separators.
+    """
+    if index + 1 >= len(line) or line[index + 1] not in '0123456789abcdefABCDEF':
+        return False
+    # Walk back to the first character of the token that contains the apostrophe.
+    start = index
+    while start > 0 and (line[start - 1].isalnum() or line[start - 1] in "_.'"):
+        start -= 1
+    return start < index and line[start].isdigit()
+
+
+def _find_log_statement_end(lines, start_line, open_paren_index):
+    """Find the line index where the LOG macro call closes, ignoring literals/comments.
+
+    Scanning starts at ``open_paren_index`` on ``lines[start_line]`` and continues
+    through the following lines until the parenthesis opened there is balanced.
+
+    This properly handles:
+    - String literals: LOG("unbalanced (", x);
+    - Character literals: LOG(')', code);
+    - Digit separators: LOG("n", 1'000);
+    - Line comments: LOG("msg", x);  // comment with )
+    - Block comments: LOG("msg" /* comment ) */, x);
+
+    Returns the index (into ``lines``) of the line holding the matching closing
+    parenthesis, or ``None`` when the call is never closed.
+    """
+    depth = 0
+    in_string = False
+    in_char = False
+    in_block_comment = False
+    escape = False
+
+    for line_index in range(start_line, len(lines)):
+        line = lines[line_index]
+        i = open_paren_index if line_index == start_line else 0
+
+        while i < len(line):
+            ch = line[i]
+            nxt = line[i + 1] if i + 1 < len(line) else ''
+
+            # Inside block comment - only look for */
+            if in_block_comment:
+                if ch == '*' and nxt == '/':
+                    in_block_comment = False
+                    i += 2
+                    continue
+                i += 1
+                continue
+
+            # Inside string literal - handle escapes
+            if in_string:
+                if escape:
+                    escape = False
+                elif ch == '\\':
+                    escape = True
+                elif ch == '"':
+                    in_string = False
+                i += 1
+                continue
+
+            # Inside character literal - handle escapes
+            if in_char:
+                if escape:
+                    escape = False
+                elif ch == '\\':
+                    escape = True
+                elif ch == "'":
+                    in_char = False
+                i += 1
+                continue
+
+            # A line comment hides the rest of this line
+            if ch == '/' and nxt == '/':
+                break
+            if ch == '/' and nxt == '*':
+                in_block_comment = True
+                i += 2
+                continue
+
+            # Check for literal starts
+            if ch == '"':
+                in_string = True
+                escape = False
+                i += 1
+                continue
+            if ch == "'":
+                # An apostrophe inside a number (1'000) is not a literal start;
+                # treating it as one would leave the scanner stuck in "char" state.
+                if _is_digit_separator(line, i):
+                    i += 1
+                    continue
+                in_char = True
+                escape = False
+                i += 1
+                continue
+
+            # Count parentheses depth outside of literals/comments
+            if ch == '(':
+                depth += 1
+            elif ch == ')':
+                depth -= 1
+                if depth == 0:
+                    return line_index
+
+            i += 1
+
+    return None
+
+
+def _has_line_comment(line):
+    """Return True when ``line`` may contain a ``//`` comment outside a literal.
+
+    The check is deliberately conservative: if a quote is still open at the end
+    of the line, any ``//`` in the line counts as a comment.
+    """
+    quote = ''
+    i = 0
+    while i < len(line):
+        ch = line[i]
+        if quote:
+            if ch == '\\':
+                i += 1  # skip the escaped character
+            elif ch == quote:
+                quote = ''
+        elif ch == '"' or ch == "'":
+            quote = ch
+        elif line.startswith('//', i):
+            return True
+        elif line.startswith('/*', i):
+            close = line.find('*/', i + 2)
+            if close == -1:
+                # Rest of the line is inside a block comment; a // there is inert.
+                return False
+            i = close + 1
+        i += 1
+    return bool(quote) and '//' in line
+
+
+def _is_safe_to_join(lines, first, last):
+    """Return True when lines ``first``..``last`` can be merged without changing meaning.
+
+    Merging is refused when a non-final line carries a ``//`` comment (it would
+    swallow the rest of the statement) or ends with a backslash line splice, and
+    when a continuation line is a preprocessor directive.
+    """
+    for index in range(first, last + 1):
+        line = lines[index]
+        if index > first and line.lstrip().startswith('#'):
+            return False
+        if index < last and (line.rstrip().endswith('\\') or _has_line_comment(line)):
+            return False
+    return True
+
+
+def join_log_statements(content: str) -> str:
+    """Join multi-line LOG macro calls onto a single line.
+
+    ``content`` is split on newlines. For each line containing a LOG-like macro,
+    the end of the call is located; when the call spans several lines and can be
+    merged safely, the continuation lines are stripped and appended to the first
+    line, separated by single spaces. All other lines are kept unchanged.
+
+    Returns the rewritten text joined with newlines.
+    """
+    lines = content.split('\n')
+    result = []
+    i = 0
+
+    while i < len(lines):
+        line = lines[i]
+
+        # Check if this line contains a LOG macro start
+        open_paren_index = _find_log_macro_open(line)
+        if open_paren_index is not None:
+            # Find where the LOG statement ends, respecting C++ syntax
+            end_index = _find_log_statement_end(lines, i, open_paren_index)
+            if (end_index is not None and end_index > i
+                    and _is_safe_to_join(lines, i, end_index)):
+                # Multi-line LOG statement found - join it
+                tail = [part.strip() for part in lines[i + 1:end_index + 1]]
+                result.append(' '.join([line] + tail))
+                i = end_index + 1
+                continue
+
+        # Not a LOG statement, single-line LOG, or unsafe to merge - keep as is
+        result.append(line)
+        i += 1
+
+    return '\n'.join(result)
+
+
+def process_file(filepath: Path) -> None:
+    """Join multi-line LOG statements in one C++ source file, in place.
+
+    The file is read as UTF-8 and written back only when joining actually
+    changed its text, so untouched files keep their timestamp and bytes.
+    On an I/O or decoding error an ``[ERROR]`` line is printed to stderr and
+    the process exits with status 1.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            content = f.read()
+
+        modified = join_log_statements(content)
+
+        # Skip the write when nothing was joined.
+        if modified != content:
+            with open(filepath, 'w', encoding='utf-8') as f:
+                f.write(modified)
+
+        print(f"[INFO] Processed: {filepath}")
+    except (OSError, UnicodeError) as e:
+        print(f"[ERROR] Failed to process {filepath}: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def main():
+    """Join LOG statements in every .cpp and .hpp file below a directory.
+
+    The directory is the first command-line argument, defaulting to the current
+    working directory. Files below a directory named ``build`` are skipped.
+    Exits with status 1 when the path is not a directory.
+    """
+    base_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path.cwd()
+
+    if not base_dir.is_dir():
+        print(f"[ERROR] Directory not found: {base_dir}", file=sys.stderr)
+        sys.exit(1)
+
+    # Find all C++ source files. Build output is skipped because those files are
+    # rewritten in place; sorting makes the processing order deterministic.
+    cpp_files = sorted(
+        path
+        for pattern in ('*.cpp', '*.hpp')
+        for path in base_dir.rglob(pattern)
+        if path.is_file()
+        and 'build' not in path.relative_to(base_dir).parts[:-1]
+    )
+
+    if not cpp_files:
+        print(f"[WARNING] No .cpp or .hpp files found in {base_dir}")
+        return
+
+    print(f"[INFO] Processing {len(cpp_files)} C++ files in {base_dir}")
+    for filepath in cpp_files:
+        process_file(filepath)
+
+    print(f"[SUCCESS] Joined LOG statements in {len(cpp_files)} files")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/generate_codecov.sh b/generate_codecov.sh
@@ -74,36 +74,61 @@ fi
 
 echo "[INFO] Using pybind module: $PYBIND_SO"
 
-# Export C++ coverage, excluding Python headers, pybind11, and system includes
+# Export C++ coverage, excluding Python headers, pybind11, system includes, and vendored deps
 llvm-cov export "$PYBIND_SO" \
   -instr-profile=default.profdata \
-  -ignore-filename-regex='(python3\.[0-9]+|cpython|pybind11|/usr/include/|/usr/lib/)' \
+  -ignore-filename-regex='(python3\.[0-9]+|cpython|pybind11|/usr/include/|/usr/lib/|build/_deps/)' \
   --skip-functions \
   -format=lcov > cpp-coverage.info
 
-# Note: LCOV exclusion markers (LCOV_EXCL_LINE) should be added to source code
-# to exclude LOG() statements from coverage. However, for automated exclusion
-# of all LOG lines without modifying source code, we can use geninfo's --omit-lines
-# feature during the merge step (see below).
+# Note: LCOV exclusion markers (LCOV_EXCL_LINE) are processed below
 
 echo "==================================="
 echo "[STEP 4] Merging Python + C++ coverage"
 echo "==================================="
 
-# Merge LCOV reports (ignore inconsistencies in Python LCOV export)
-echo "[ACTION] Merging Python and C++ coverage"
-lcov -a python-coverage.info -a cpp-coverage.info -o total.info \
+# Merge LCOV reports and filter LOG statements using lcov's built-in exclusion
+# The --rc option sets lcov_excl_line to match any line containing LOG macros
+# Since we joined multi-line LOGs during build, they're now on single lines
+# Setting lcov_excl_line replaces lcov's default LCOV_EXCL_LINE marker, so the
+# marker is listed explicitly to keep in-source exclusions working
+echo "[ACTION] Merging Python and C++ coverage with LOG exclusion"
+lcov -a python-coverage.info -a cpp-coverage.info -o total-unfiltered.info \
+  --rc lcov_excl_line='LCOV_EXCL_LINE|\bLOG[A-Z_]*\s*\(' \
   --ignore-errors inconsistent,corrupt
 
+echo "[INFO] Coverage merged with LOG statements excluded"
+
+# Defense-in-depth: drop any vendored third-party sources pulled in via CMake
+# FetchContent (e.g. simdutf). The llvm-cov ignore-filename-regex above is the
+# primary filter; this catches anything that slips through future deps.
+echo "[ACTION] Removing vendored third-party sources from merged coverage"
+lcov --remove total-unfiltered.info '*/build/_deps/*' -o total.info \
+  --ignore-errors inconsistent,unused
+
 # Normalize paths so everything starts from mssql_python/
 echo "[ACTION] Normalizing paths in LCOV report"
 sed -i "s|$(pwd)/||g" total.info
 
 # Generate full HTML report
+echo "[ACTION] Generating HTML coverage report"
 genhtml total.info \
   --output-directory unified-coverage \
   --quiet \
   --title "Unified Coverage Report"
 
 # Generate Cobertura XML (for Azure DevOps Code Coverage tab)
 lcov_cobertura total.info --output coverage.xml
+
+echo "==================================="
+echo "[STEP 5] Cleanup"
+echo "==================================="
+
+# Restore original source files if they were backed up during coverage build.
+# The archive is deleted only after a successful extraction, so a failed
+# restore never destroys the only copy of the original sources.
+BACKUP_FILE="mssql_python/pybind/.source_backup_coverage.tar.gz"
+if [ -f "$BACKUP_FILE" ]; then
+    echo "[ACTION] Restoring original source files from backup"
+    if (cd mssql_python/pybind && tar -xzf .source_backup_coverage.tar.gz); then
+        rm -f "$BACKUP_FILE"
+        echo "[INFO] Original source files restored"
+    else
+        echo "[ERROR] Failed to restore original sources; backup kept at $BACKUP_FILE" >&2
+        exit 1
+    fi
+fi
+
+echo "[INFO] Coverage report generation complete"
diff --git a/mssql_python/pybind/build.sh b/mssql_python/pybind/build.sh
@@ -31,6 +31,45 @@ COVERAGE_MODE=false
 if [[ "${1:-}" == "codecov" || "${1:-}" == "--coverage" ]]; then
     COVERAGE_MODE=true
     echo "[MODE] Enabling Clang coverage instrumentation"
+
+    # For coverage builds, join multi-line LOG statements to simplify LCOV filtering
+    # Original source is backed up and must be restored by generate_codecov.sh after analysis
+    echo "[ACTION] Preparing source for coverage build (joining LOG statements)"
+
+    # Save current directory
+    ORIGINAL_DIR=$(pwd)
+
+    # Back up every .cpp/.hpp file outside build directories - the same file types the
+    # join script rewrites. Names are passed NUL-delimited so paths containing
+    # whitespace survive, and tar's exit status is checked instead of discarded.
+    BACKUP_FILE="${ORIGINAL_DIR}/.source_backup_coverage.tar.gz"
+    echo "[INFO] Creating backup of source files"
+    if ! find . -type d -name build -prune -o \
+            -type f \( -name "*.cpp" -o -name "*.hpp" \) -print0 \
+            | tar -czf "$BACKUP_FILE" --null -T -; then
+        echo "[ERROR] Failed to create source backup"
+        rm -f "$BACKUP_FILE"
+        exit 1
+    fi
+
+    # Join LOG statements using the helper script
+    SCRIPT_PATH="${ORIGINAL_DIR}/../../eng/scripts/join_logs_for_coverage.py"
+    if [[ -f "$SCRIPT_PATH" ]]; then
+        # Test the command directly: if errexit is enabled, a separate `$?` check
+        # would never reach the failure branch that restores the sources.
+        if python3 "$SCRIPT_PATH" "$ORIGINAL_DIR"; then
+            echo "[SUCCESS] LOG statements joined for coverage build"
+            echo "[INFO] Original source backed up to $BACKUP_FILE"
+            echo "[IMPORTANT] Run 'tar -xzf $BACKUP_FILE' in $(pwd) to restore after coverage analysis"
+        else
+            echo "[ERROR] Failed to join LOG statements"
+            # Restore backup and exit; keep the archive if extraction fails
+            if tar -xzf "$BACKUP_FILE"; then
+                rm -f "$BACKUP_FILE"
+            else
+                echo "[ERROR] Could not restore sources; backup kept at $BACKUP_FILE"
+            fi
+            exit 1
+        fi
+    else
+        echo "[WARNING] join_logs_for_coverage.py not found at $SCRIPT_PATH"
+        echo "[WARNING] Continuing with original source (LOG filtering may be incomplete)"
+        rm -f "$BACKUP_FILE"  # No need for backup if not joining
+    fi
 fi
 
 # Get Python version from active interpreter