---
# promptfooconfig.yaml
# --------------------
# Configuration for evaluating the AI Drilling Copilot Agents
#
# NOTE: This rubric is completely customizable!
# You can tweak the test cases, prompts, and evaluation rules
# to match the SPE GCS 2026 ML Challenge evaluation criteria.

description: "SPE GCS 2026: Agent Rubric Evaluation"

# Model(s) under evaluation.
providers:
  # Using Promptfoo's native Google provider.
  # 3.1 is not fully supported by the npm plugin yet.
  # NOTE(review): confirm this exact model id is accepted by the provider;
  # preview models are often published with a dated suffix.
  - id: google:gemini-2.5-flash-preview
    label: "baseline-agent-model"

# One prompt template per agent role. Promptfoo evaluates every test case
# below against each of these prompts.
# NOTE(review): presumably each template references {{question}} and
# {{context}} -- verify against the prompt files.
prompts:
  - file://tests/prompts/analyst_prompt.txt
  - file://tests/prompts/historian_prompt.txt
  - file://tests/prompts/auditor_prompt.txt
  - file://tests/prompts/lead_prompt.txt

tests:
  # Case 1: identify the hardest hole section and justify the answer.
  - vars:
      question: "Which hole section in well 15/9-19 B was the most challenging to drill?"
      context: >-
        DDR data shows NPT of 45 hours in the 12.25 inch section due to
        severe losses. WITSML confirms high torque fluctuations.
    assert:
      # Case-insensitive substring check; quoted so it stays a string,
      # not a float.
      - type: "icontains"
        value: "12.25"
      # Model-graded checks: the answer must state its confidence and
      # cite its evidence.
      - type: "llm-rubric"
        value: "The response MUST explicitly state a 'Confidence Level' or 'Uncertainty'."
      - type: "llm-rubric"
        value: >-
          The response must clearly state the evidence (either data or
          reports) used to make the conclusion.

  # Case 2: turn historical weather-delay data into lessons learned.
  - vars:
      question: "What were the lessons learned regarding weather-induced NPT?"
      context: >-
        Historical Volve data indicates waiting on weather (WOW) caused 15%
        of all delays, particularly stalling riser pulling operations.
    assert:
      - type: "llm-rubric"
        value: >-
          The response must synthesize the context to identify actionable
          lessons learned, not just repeat the data.
      # Fail the case if the model emits generic assistant boilerplate.
      - type: "not-icontains"
        value: "As an AI language model"