diff --git a/.github/workflows/python-integration-tests.yml b/.github/workflows/python-integration-tests.yml
index 32c25efdbf..8c74f21130 100644
--- a/.github/workflows/python-integration-tests.yml
+++ b/.github/workflows/python-integration-tests.yml
@@ -157,6 +157,8 @@ jobs:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       ANTHROPIC_CHAT_MODEL: ${{ vars.ANTHROPIC_CHAT_MODEL_ID }}
       LOCAL_MCP_URL: ${{ vars.LOCAL_MCP__URL }}
+      OLLAMA_MODEL: qwen2.5:1.5b
+      OLLAMA_EMBEDDING_MODEL: nomic-embed-text
     defaults:
       run:
         working-directory: python
@@ -171,6 +173,37 @@ jobs:
         with:
           python-version: ${{ env.UV_PYTHON }}
           os: ${{ runner.os }}
+      - name: Install Ollama
+        run: curl -fsSL https://ollama.com/install.sh | sh
+        working-directory: .
+      - name: Cache Ollama models
+        uses: actions/cache@v4
+        with:
+          path: ~/.ollama/models
+          key: ollama-models-qwen2.5-1.5b-nomic-embed-text-v1
+      - name: Start Ollama and pull models
+        run: |
+          # Stop any Ollama instance auto-started by the install script
+          pkill ollama || true
+          sleep 2
+          ollama serve &
+          for i in $(seq 1 30); do
+            if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
+              break
+            fi
+            sleep 1
+          done
+          # Pull models with retry for transient 429 rate limits
+          for model in qwen2.5:1.5b nomic-embed-text; do
+            for attempt in 1 2 3; do
+              if ollama pull "$model"; then
+                break
+              fi
+              echo "Retry $attempt for $model (waiting 15s)..."
+              sleep 15
+            done
+          done
+        working-directory: .
       - name: Start local MCP server
         id: local-mcp
         uses: ./.github/actions/setup-local-mcp-server
@@ -271,7 +304,7 @@ jobs:
          -m integration
          -n logical --dist worksteal
          -x
-          --timeout=360 --session-timeout=900 --timeout_method thread
+          --timeout=480 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 5
          --junitxml=pytest.xml
      - name: Upload test results
@@ -435,9 +468,9 @@ jobs:
           path: ./python/pytest.xml
           if-no-files-found: ignore

-  # Flaky test trend report (aggregates per-job JUnit XML results)
-  python-flaky-test-report:
-    name: Flaky Test Report
+  # Integration test trend report (aggregates per-job JUnit XML results)
+  python-integration-test-report:
+    name: Integration Test Report
     if: >
       always() &&
       (contains(join(needs.*.result, ','), 'success') ||
@@ -471,36 +504,36 @@ jobs:
         with:
           pattern: test-results-*
           path: test-results/
-      - name: Restore flaky report history cache
+      - name: Restore report history cache
         uses: actions/cache/restore@v4
         with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-integration-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-integration-${{ github.run_id }}
           restore-keys: |
-            flaky-report-history-integration-
+            integration-report-history-integration-
      - name: Generate trend report
        run: >
          uv run python scripts/flaky_report/aggregate.py
          ../test-results/
-          flaky-report-history.json
-          flaky-test-report.md
+          integration-report-history.json
+          integration-test-report.md
      - name: Post to Job Summary
        if: always()
-        run: cat flaky-test-report.md >> $GITHUB_STEP_SUMMARY
-      - name: Save flaky report history cache
+        run: cat integration-test-report.md >> $GITHUB_STEP_SUMMARY
+      - name: Save report history cache
        if: always()
        uses: actions/cache/save@v4
        with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-integration-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-integration-${{ github.run_id }}
      - name: Upload unified trend report
        if: always()
        uses: actions/upload-artifact@v7
        with:
-          name: flaky-test-report
+          name: integration-test-report
          path: |
-            python/flaky-test-report.md
-            python/flaky-report-history.json
+            python/integration-test-report.md
+            python/integration-report-history.json

  python-integration-tests-check:
    if: always()
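Note (illustrative, not part of the patch): the "Start Ollama and pull models" step above
gates on GET /api/tags answering before any model pulls. The same readiness check is useful
when running the integration suite locally; a minimal stdlib sketch, assuming Ollama's
default port (the helper name is hypothetical):

    import time
    import urllib.request

    def wait_for_ollama(base_url: str = "http://localhost:11434", timeout: float = 30.0) -> bool:
        """Poll GET /api/tags once per second until the server answers or the timeout elapses."""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with urllib.request.urlopen(f"{base_url}/api/tags", timeout=2):
                    return True  # server is up; models can be pulled next
            except OSError:  # URLError and connection errors are OSError subclasses
                time.sleep(1)
        return False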
diff --git a/.github/workflows/python-merge-tests.yml b/.github/workflows/python-merge-tests.yml
index 9529e54d97..1ad5019951 100644
--- a/.github/workflows/python-merge-tests.yml
+++ b/.github/workflows/python-merge-tests.yml
@@ -278,6 +278,8 @@ jobs:
       ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
       ANTHROPIC_CHAT_MODEL: ${{ vars.ANTHROPIC_CHAT_MODEL_ID }}
       LOCAL_MCP_URL: ${{ vars.LOCAL_MCP__URL }}
+      OLLAMA_MODEL: qwen2.5:1.5b
+      OLLAMA_EMBEDDING_MODEL: nomic-embed-text
     defaults:
       run:
         working-directory: python
@@ -289,6 +291,37 @@ jobs:
         with:
           python-version: ${{ env.UV_PYTHON }}
           os: ${{ runner.os }}
+      - name: Install Ollama
+        run: curl -fsSL https://ollama.com/install.sh | sh
+        working-directory: .
+      - name: Cache Ollama models
+        uses: actions/cache@v4
+        with:
+          path: ~/.ollama/models
+          key: ollama-models-qwen2.5-1.5b-nomic-embed-text-v1
+      - name: Start Ollama and pull models
+        run: |
+          # Stop any Ollama instance auto-started by the install script
+          pkill ollama || true
+          sleep 2
+          ollama serve &
+          for i in $(seq 1 30); do
+            if curl -sf http://localhost:11434/api/tags > /dev/null 2>&1; then
+              break
+            fi
+            sleep 1
+          done
+          # Pull models with retry for transient 429 rate limits
+          for model in qwen2.5:1.5b nomic-embed-text; do
+            for attempt in 1 2 3; do
+              if ollama pull "$model"; then
+                break
+              fi
+              echo "Retry $attempt for $model (waiting 15s)..."
+              sleep 15
+            done
+          done
+        working-directory: .
       - name: Start local MCP server
         id: local-mcp
         uses: ./.github/actions/setup-local-mcp-server
@@ -403,7 +436,7 @@ jobs:
          -m integration
          -n logical --dist worksteal
          -x
-          --timeout=360 --session-timeout=900 --timeout_method thread
+          --timeout=480 --session-timeout=900 --timeout_method thread
          --retries 2 --retry-delay 5
          --junitxml=pytest.xml
        working-directory: ./python
@@ -619,9 +652,9 @@ jobs:
           path: ./python/pytest.xml
           if-no-files-found: ignore

-  # Flaky test trend report (aggregates per-job JUnit XML results)
-  python-flaky-test-report:
-    name: Flaky Test Report
+  # Integration test trend report (aggregates per-job JUnit XML results)
+  python-integration-test-report:
+    name: Integration Test Report
     if: >
       always() &&
       (contains(join(needs.*.result, ','), 'success') ||
@@ -652,36 +685,36 @@ jobs:
         with:
           pattern: test-results-*
           path: test-results/
-      - name: Restore flaky report history cache
+      - name: Restore report history cache
         uses: actions/cache/restore@v4
         with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-merge-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-merge-${{ github.run_id }}
           restore-keys: |
-            flaky-report-history-merge-
+            integration-report-history-merge-
      - name: Generate trend report
        run: >
          uv run python scripts/flaky_report/aggregate.py
          ../test-results/
-          flaky-report-history.json
-          flaky-test-report.md
+          integration-report-history.json
+          integration-test-report.md
      - name: Post to Job Summary
        if: always()
-        run: cat flaky-test-report.md >> $GITHUB_STEP_SUMMARY
-      - name: Save flaky report history cache
+        run: cat integration-test-report.md >> $GITHUB_STEP_SUMMARY
+      - name: Save report history cache
        if: always()
        uses: actions/cache/save@v4
        with:
-          path: python/flaky-report-history.json
-          key: flaky-report-history-merge-${{ github.run_id }}
+          path: python/integration-report-history.json
+          key: integration-report-history-merge-${{ github.run_id }}
      - name: Upload unified trend report
        if: always()
        uses: actions/upload-artifact@v7
        with:
-          name: flaky-test-report
+          name: integration-test-report
          path: |
-            python/flaky-test-report.md
-            python/flaky-report-history.json
+            python/integration-test-report.md
+            python/integration-report-history.json

  python-integration-tests-check:
    if: always()
diff --git a/python/packages/azurefunctions/tests/integration_tests/test_03_reliable_streaming.py b/python/packages/azurefunctions/tests/integration_tests/test_03_reliable_streaming.py
index 23c58a2a95..c99198b602 100644
--- a/python/packages/azurefunctions/tests/integration_tests/test_03_reliable_streaming.py
+++ b/python/packages/azurefunctions/tests/integration_tests/test_03_reliable_streaming.py
@@ -26,7 +26,6 @@
     pytest.mark.integration,
     pytest.mark.sample("03_reliable_streaming"),
     pytest.mark.usefixtures("function_app_for_test"),
-    pytest.mark.skip(reason="Temp disabled to fix test instability - needs investigation into root cause"),
 ]
diff --git a/python/packages/azurefunctions/tests/integration_tests/test_11_workflow_parallel.py b/python/packages/azurefunctions/tests/integration_tests/test_11_workflow_parallel.py
index bc9fa59bca..65a96678a1 100644
--- a/python/packages/azurefunctions/tests/integration_tests/test_11_workflow_parallel.py
+++ b/python/packages/azurefunctions/tests/integration_tests/test_11_workflow_parallel.py
@@ -42,7 +42,7 @@ def _setup(self, base_url: str, sample_helper) -> None:
         self.base_url = base_url
         self.helper = sample_helper

-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_parallel_workflow_document_analysis(self) -> None:
         """Test parallel workflow with a standard document."""
         payload = {
@@ -71,7 +71,7 @@ def test_parallel_workflow_document_analysis(self) -> None:
         assert status["runtimeStatus"] == "Completed"
         assert "output" in status

-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_parallel_workflow_short_document(self) -> None:
         """Test parallel workflow with a short document."""
         payload = {
@@ -91,7 +91,7 @@ def test_parallel_workflow_short_document(self) -> None:
         assert status["runtimeStatus"] == "Completed"
         assert "output" in status

-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_parallel_workflow_technical_document(self) -> None:
         """Test parallel workflow with a technical document."""
         payload = {
@@ -115,7 +115,7 @@ def test_parallel_workflow_technical_document(self) -> None:
         status = self.helper.wait_for_orchestration_with_output(data["statusQueryGetUri"], max_wait=300)
         assert status["runtimeStatus"] == "Completed"

-    @pytest.mark.skip(reason="Causes timeouts.")
+    @pytest.mark.skip(reason="xdist distributes module tests across workers, each spawning a func process")
     def test_workflow_status_endpoint(self) -> None:
         """Test that the workflow status endpoint works correctly."""
         payload = {
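Note (illustrative, not part of the patch): the tests above remain skipped because every
xdist worker that picks up a test from this module spawns its own func host process. One
possible alternative to skipping is pinning the module to a single worker via
pytest-xdist's xdist_group mark; that mark only takes effect under --dist loadgroup, not
the --dist worksteal mode the workflows above use, so treat this as a sketch:

    import pytest

    # With `pytest -n auto --dist loadgroup`, every test in one xdist_group runs on the
    # same worker, so only a single func process would be spawned for the module.
    pytestmark = [
        pytest.mark.integration,
        pytest.mark.xdist_group(name="workflow_parallel_func_host"),  # hypothetical group name
    ]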
diff --git a/python/packages/durabletask/tests/integration_tests/test_06_dt_multi_agent_orchestration_conditionals.py b/python/packages/durabletask/tests/integration_tests/test_06_dt_multi_agent_orchestration_conditionals.py
index d20c67e20f..177f4ca5f4 100644
--- a/python/packages/durabletask/tests/integration_tests/test_06_dt_multi_agent_orchestration_conditionals.py
+++ b/python/packages/durabletask/tests/integration_tests/test_06_dt_multi_agent_orchestration_conditionals.py
@@ -52,7 +52,6 @@ def test_agents_registered(self):
         assert email_agent is not None
         assert email_agent.name == EMAIL_AGENT_NAME

-    @pytest.mark.skip(reason="Consistently fails due to orchestration timeouts - needs investigation")
     def test_conditional_branching(self):
         """Test that conditional branching works correctly."""
         # Test with obvious spam
diff --git a/python/packages/foundry/tests/foundry/test_foundry_agent.py b/python/packages/foundry/tests/foundry/test_foundry_agent.py
index e110e540fe..ff056d8422 100644
--- a/python/packages/foundry/tests/foundry/test_foundry_agent.py
+++ b/python/packages/foundry/tests/foundry/test_foundry_agent.py
@@ -634,7 +634,6 @@ def _import_with_missing_azure_monitor(
 @pytest.mark.flaky
 @pytest.mark.integration
 @skip_if_foundry_agent_integration_tests_disabled
-@pytest.mark.skip(reason="Test agent seems to have disappeared from the test environment; needs investigation.")
 async def test_foundry_agent_basic_run() -> None:
     """Smoke-test FoundryAgent against a real configured agent."""
     async with FoundryAgent(credential=AzureCliCredential(), allow_preview=True) as agent:
@@ -648,10 +647,11 @@ async def test_foundry_agent_basic_run() -> None:
 @pytest.mark.flaky
 @pytest.mark.integration
 @skip_if_foundry_agent_integration_tests_disabled
-@pytest.mark.skip(reason="Test agent seems to have disappeared from the test environment; needs investigation.")
 async def test_foundry_agent_custom_client_run() -> None:
     """Smoke-test FoundryAgent against a real configured agent."""
-    async with FoundryAgent(credential=AzureCliCredential(), client_type=RawFoundryAgentChatClient) as agent:
+    async with FoundryAgent(
+        credential=AzureCliCredential(), client_type=RawFoundryAgentChatClient, allow_preview=True
+    ) as agent:
         response = await agent.run("Please respond with exactly: 'This is a response test.'")

         assert isinstance(response, AgentResponse)
diff --git a/python/packages/foundry_hosting/tests/test_responses_int.py b/python/packages/foundry_hosting/tests/test_responses_int.py
index 24c590f25c..a67b86f00a 100644
--- a/python/packages/foundry_hosting/tests/test_responses_int.py
+++ b/python/packages/foundry_hosting/tests/test_responses_int.py
@@ -559,25 +559,21 @@ async def test_tool_call_streaming(self, server_with_tools: ResponsesHostServer)
 class TestOptions:
     """Verify chat options are passed through to the model."""

-    @pytest.mark.skip(reason="Flaky in merge queue, blocking unrelated PRs. Tracked in #5553.")
     @pytest.mark.flaky
     @pytest.mark.integration
     @skip_if_foundry_hosting_integration_tests_disabled
     async def test_temperature_and_max_tokens(self, server: ResponsesHostServer) -> None:
-        """Set temperature and max_output_tokens and verify the response succeeds."""
+        """Set max_output_tokens and verify the response succeeds."""
         resp = await _post_json(
             server,
             {
                 "input": "Say hello briefly.",
                 "stream": False,
-                "max_output_tokens": 50,
+                "max_output_tokens": 200,
             },
         )
         assert resp.status_code == 200
         body = resp.json()
         assert body["status"] == "completed"
-        output_messages = [o for o in body["output"] if o["type"] == "message"]
-        assert len(output_messages) == 1
-        output_text = output_messages[0]["content"][0]["text"]
-        assert len(output_text) > 0
+        assert len(body["output"]) > 0
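Note (illustrative, not part of the patch): the loosened assertion above reflects that a
Responses output array is not guaranteed to hold exactly one message item; reasoning or
tool items may appear, and a small max_output_tokens can truncate the message entirely.
A test that still needs the message text could use a tolerant accessor; a sketch reusing
the field names from this test (the helper itself is hypothetical):

    def first_message_text(output: list[dict]) -> str | None:
        """Return the text of the first item with type == "message", if any."""
        for item in output:
            if item.get("type") == "message":
                content = item.get("content") or []
                if content and isinstance(content[0], dict) and "text" in content[0]:
                    return content[0]["text"]
        return None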
diff --git a/python/packages/openai/tests/openai/test_openai_chat_client.py b/python/packages/openai/tests/openai/test_openai_chat_client.py
index cb4e1b5895..5c9b2b0438 100644
--- a/python/packages/openai/tests/openai/test_openai_chat_client.py
+++ b/python/packages/openai/tests/openai/test_openai_chat_client.py
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft. All rights reserved.

+import asyncio
 import base64
 import inspect
 import json
@@ -36,6 +37,7 @@
     ChatClientInvalidRequestException,
     SettingNotFoundError,
 )
+from dotenv import load_dotenv
 from openai import BadRequestError
 from openai.types.responses.response_reasoning_item import Summary
 from openai.types.responses.response_reasoning_summary_text_delta_event import (
@@ -58,6 +60,8 @@
 from agent_framework_openai._chat_client import OPENAI_LOCAL_SHELL_CALL_ITEM_ID_KEY
 from agent_framework_openai._exceptions import OpenAIContentFilterException

+load_dotenv()
+
 skip_if_openai_integration_tests_disabled = pytest.mark.skipif(
     os.getenv("OPENAI_API_KEY", "") in ("", "test-dummy-key"),
     reason="No real OPENAI_API_KEY provided; skipping integration tests.",
@@ -120,6 +124,15 @@ async def create_vector_store(
     if result.last_error is not None:
         raise Exception(f"Vector store file processing failed with status: {result.last_error.message}")

+    # Wait for the vector store index to be fully searchable.
+    # create_and_poll confirms file processing, but the search index is eventually consistent.
+    for _ in range(10):
+        vs = await client.client.vector_stores.retrieve(vector_store.id)
+        if vs.file_counts.completed >= 1 and vs.file_counts.in_progress == 0:
+            break
+        await asyncio.sleep(1)
+    await asyncio.sleep(2)
+
     return file.id, Content.from_hosted_vector_store(vector_store_id=vector_store.id)

@@ -4379,10 +4392,6 @@ async def test_integration_web_search() -> None:
     assert response.text is not None


-@pytest.mark.skip(
-    reason="Unreliable due to OpenAI vector store indexing potential "
-    "race condition. See https://github.com/microsoft/agent-framework/issues/1669"
-)
 @pytest.mark.flaky
 @pytest.mark.integration
 @skip_if_openai_integration_tests_disabled
@@ -4413,10 +4422,6 @@ async def test_integration_file_search() -> None:
     assert "75" in response.text


-@pytest.mark.skip(
-    reason="Unreliable due to OpenAI vector store indexing "
-    "potential race condition. See https://github.com/microsoft/agent-framework/issues/1669"
-)
 @pytest.mark.flaky
 @pytest.mark.integration
 @skip_if_openai_integration_tests_disabled
@@ -4428,14 +4433,15 @@ async def test_integration_streaming_file_search() -> None:
     file_id, vector_store = await create_vector_store(openai_responses_client)
     # Use static method for file search tool
     file_search_tool = OpenAIChatClient.get_file_search_tool(vector_store_ids=[vector_store.vector_store_id])
-    # Test that the client will use the web search tool
-    response = openai_responses_client.get_streaming_response(
+    # Test that the client will use the file search tool
+    response = openai_responses_client.get_response(
         messages=[
             Message(
                 role="user",
                 contents=["What is the weather today? Do a file search to find the answer."],
             )
         ],
+        stream=True,
         options={
             "tool_choice": "auto",
             "tools": [file_search_tool],
diff --git a/python/packages/openai/tests/openai/test_openai_chat_client_azure.py b/python/packages/openai/tests/openai/test_openai_chat_client_azure.py
index a5fdff72b5..b16fbd0f7f 100644
--- a/python/packages/openai/tests/openai/test_openai_chat_client_azure.py
+++ b/python/packages/openai/tests/openai/test_openai_chat_client_azure.py
@@ -355,7 +355,6 @@ async def test_integration_web_search() -> None:
 @pytest.mark.integration
 @skip_if_azure_openai_integration_tests_disabled
 @_with_azure_openai_debug()
-@pytest.mark.skip(reason="Azure OpenAI with files raises 500 error. Needs investigation.")
 async def test_integration_client_file_search() -> None:
     async with AzureCliCredential() as credential:
         client = OpenAIChatClient(credential=credential)
@@ -381,7 +380,6 @@ async def test_integration_client_file_search() -> None:
 @pytest.mark.integration
 @skip_if_azure_openai_integration_tests_disabled
 @_with_azure_openai_debug()
-@pytest.mark.skip(reason="Azure OpenAI with files raises 500 error. Needs investigation.")
 async def test_integration_client_file_search_streaming() -> None:
     async with AzureCliCredential() as credential:
         client = OpenAIChatClient(credential=credential)
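Note (illustrative, not part of the patch): the file-search tests re-enabled above depend
on the new polling in create_vector_store (the @@ -120,6 +124,15 @@ hunk earlier), which
treats the vector-store search index as eventually consistent. The same wait-until-ready
pattern, generalized as a sketch (the helper is hypothetical):

    import asyncio
    from collections.abc import Awaitable, Callable

    async def poll_until(check: Callable[[], Awaitable[bool]], attempts: int = 10, delay: float = 1.0) -> bool:
        """Await `check` up to `attempts` times, sleeping `delay` seconds between tries."""
        for _ in range(attempts):
            if await check():
                return True
            await asyncio.sleep(delay)
        return False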
diff --git a/python/samples/04-hosting/azure_functions/11_workflow_parallel/function_app.py b/python/samples/04-hosting/azure_functions/11_workflow_parallel/function_app.py
index 7deea4211c..0669d95e7b 100644
--- a/python/samples/04-hosting/azure_functions/11_workflow_parallel/function_app.py
+++ b/python/samples/04-hosting/azure_functions/11_workflow_parallel/function_app.py
@@ -363,7 +363,7 @@ def _create_workflow() -> Workflow:

     chat_client = OpenAIChatCompletionClient(
         model=os.environ["AZURE_OPENAI_MODEL"],
-        api_key=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default"),
+        credential=get_bearer_token_provider(credential, "https://cognitiveservices.azure.com/.default"),
     )

     # Create agents for parallel analysis
diff --git a/python/samples/04-hosting/durabletask/06_multi_agent_orchestration_conditionals/worker.py b/python/samples/04-hosting/durabletask/06_multi_agent_orchestration_conditionals/worker.py
index 0b5f014873..2b1af9d441 100644
--- a/python/samples/04-hosting/durabletask/06_multi_agent_orchestration_conditionals/worker.py
+++ b/python/samples/04-hosting/durabletask/06_multi_agent_orchestration_conditionals/worker.py
@@ -70,7 +70,7 @@ def create_spam_agent() -> "Agent":
     return Agent(
         client=OpenAIChatCompletionClient(
             model=os.environ["AZURE_OPENAI_MODEL"],
-            api_key=get_async_bearer_token_provider(
+            credential=get_async_bearer_token_provider(
                 AsyncAzureCliCredential(), "https://cognitiveservices.azure.com/.default"
             ),
         ),
@@ -88,7 +88,7 @@ def create_email_agent() -> "Agent":
     return Agent(
         client=OpenAIChatCompletionClient(
             model=os.environ["AZURE_OPENAI_MODEL"],
-            api_key=get_async_bearer_token_provider(
+            credential=get_async_bearer_token_provider(
                 AsyncAzureCliCredential(), "https://cognitiveservices.azure.com/.default"
             ),
         ),
diff --git a/python/scripts/flaky_report/__init__.py b/python/scripts/flaky_report/__init__.py
index e5a0eeb0ca..e3b0cc6de7 100644
--- a/python/scripts/flaky_report/__init__.py
+++ b/python/scripts/flaky_report/__init__.py
@@ -1,6 +1,6 @@
 # Copyright (c) Microsoft. All rights reserved.

-"""Flaky test report aggregation and trend generation.
+"""Integration test report aggregation and trend generation.

 Parses JUnit XML (``pytest.xml``) files produced by each CI job, merges
 them with historical data, and generates a markdown trend report showing
diff --git a/python/scripts/flaky_report/__main__.py b/python/scripts/flaky_report/__main__.py
index 89969baae6..c944e135f8 100644
--- a/python/scripts/flaky_report/__main__.py
+++ b/python/scripts/flaky_report/__main__.py
@@ -1,15 +1,15 @@
 # Copyright (c) Microsoft. All rights reserved.

-"""CLI entry point for the flaky test report tool.
+"""CLI entry point for the integration test report tool.

 Usage:
     uv run python -m scripts.flaky_report

 Example (from python/ directory):
     uv run python -m scripts.flaky_report \\
-        ../flaky-reports/ \\
-        flaky-report-history.json \\
-        flaky-test-report.md
+        ../test-results/ \\
+        integration-report-history.json \\
+        integration-test-report.md
 """

 import sys
diff --git a/python/scripts/flaky_report/aggregate.py b/python/scripts/flaky_report/aggregate.py
index e07a5e136a..e803add730 100644
--- a/python/scripts/flaky_report/aggregate.py
+++ b/python/scripts/flaky_report/aggregate.py
@@ -247,7 +247,7 @@ def _short_name(nodeid: str) -> str:
 def generate_trend_report(runs: list[dict[str, Any]]) -> str:
     """Generate a markdown trend report from run history."""
     lines = [
-        "# 🔬 Flaky Test Report",
+        "# 🔬 Integration Test Report",
         "",
         f"*Generated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}*",
         "",
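Note (illustrative, not part of the patch): the renamed report still consumes standard
JUnit XML produced by --junitxml=pytest.xml. For reference, the headline counts can be
read with the stdlib alone; a sketch (aggregate.py's real logic additionally merges the
parsed runs with historical data before rendering the markdown trend report):

    import xml.etree.ElementTree as ET

    def junit_counts(path: str) -> dict[str, int]:
        """Sum the tests/failures/errors/skipped attributes over every <testsuite>."""
        root = ET.parse(path).getroot()
        totals = {"tests": 0, "failures": 0, "errors": 0, "skipped": 0}
        # iter() also matches the root element, so bare <testsuite> files work as well
        for suite in root.iter("testsuite"):
            for key in totals:
                totals[key] += int(suite.get(key, "0"))
        return totals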