Spaces:

MartinHummel
/

FinalAssignment

Sleeping

App Files Files Community

MartinHummel committed on May 14

Commit

7f63349

1 Parent(s): e327cd6

agent prompt

Browse files

Files changed (2) hide show

agent/gaia_agent.py +34 -4
local_benchmark.py +26 -0

agent/gaia_agent.py CHANGED Viewed

@@ -15,7 +15,7 @@ from langchain_community.chat_models import ChatOpenAI
 def create_langchain_agent() -> AgentExecutor:
     llm = ChatOpenAI(
-        model_name="gpt-4o",  # Or "gpt-3.5-turbo"
         temperature=0.1,
         openai_api_key=os.getenv("OPENAI_API_KEY"),
     )
@@ -29,14 +29,44 @@ def create_langchain_agent() -> AgentExecutor:
         Tool(name="vegetable_classifier_2022", func=vegetable_classifier_2022, description="Classify and extract only vegetables, excluding botanical fruits, based on a comma-separated list of food items."),
         Tool(name="excel_food_sales_sum", func=excel_food_sales_sum, description="Parse uploaded Excel file and return total food-related sales."),
     ]
     agent = initialize_agent(
         tools=tools,
         llm=llm,
-        agent=AgentType.OPENAI_FUNCTIONS,
-        verbose=True
     )
     '''
     agent = initialize_agent(
         tools=tools,

 def create_langchain_agent() -> AgentExecutor:
     llm = ChatOpenAI(
+        model_name="gpt-4o",
         temperature=0.1,
         openai_api_key=os.getenv("OPENAI_API_KEY"),
     )
         Tool(name="vegetable_classifier_2022", func=vegetable_classifier_2022, description="Classify and extract only vegetables, excluding botanical fruits, based on a comma-separated list of food items."),
         Tool(name="excel_food_sales_sum", func=excel_food_sales_sum, description="Parse uploaded Excel file and return total food-related sales."),
     ]
+    agent_kwargs = {
+        "prefix": (
+            "You are a helpful AI assistant completing GAIA benchmark tasks.\n"
+            "You MUST use the tools provided to answer the user's question. Do not answer from your own knowledge.\n"
+            "Carefully analyze the question to determine the most appropriate tool to use.\n"
+            "Here are guidelines for using the tools:\n"
+            "- Use 'wikipedia_search' to find factual information about topics, events, people, etc. (e.g., 'Use wikipedia_search to find the population of France').\n"
+            "- Use 'youtube_transcript' to extract transcripts from YouTube videos when the question requires understanding the video content. (e.g., 'Use youtube_transcript to summarize the key points of this video').\n"
+            "- Use 'audio_transcriber' to transcribe uploaded audio files. (e.g., 'Use audio_transcriber to get the text from this audio recording').\n"
+            "- Use 'chess_image_solver' to analyze and solve chess puzzles from images. (e.g., 'Use chess_image_solver to determine the best move in this chess position').\n"
+            "- Use 'file_parser' to parse and analyze data from Excel or CSV files. (e.g., 'Use file_parser to calculate the average sales from this data').\n"
+            "- Use 'vegetable_classifier_2022' to classify a list of food items and extract only the vegetables. (e.g., 'Use vegetable_classifier_2022 to get a list of the vegetables in this grocery list').\n"
+            "- Use 'excel_food_sales_sum' to extract total food sales from excel files. (e.g., 'Use excel_food_sales_sum to calculate the total food sales').\n"
+            "Do NOT guess or make up answers. If a tool cannot provide the answer, truthfully respond that you were unable to find the information.\n"
+        ),
+        "suffix": (
+            "Use the tools to research or calculate the answer.\n"
+            "If a tool fails, explain the reason for the failure instead of hallucinating an answer.\n"
+            "Provide concise and direct answers as requested in the questions. Do not add extra information unless explicitly asked for.\n"
+            "For example, if asked for a number, return only the number. If asked for a list, return only the list.\n"
+        ),
+    }
     agent = initialize_agent(
         tools=tools,
         llm=llm,
+        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
+        handle_parsing_errors=True,
+        verbose=True,
+        max_iterations=10,
+        max_execution_time=60,
+        agent_kwargs=agent_kwargs  # Place the agent_kwargs here
     )
+    return agent
     '''
     agent = initialize_agent(
         tools=tools,

local_benchmark.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from agent.gaia_agent import create_langchain_agent  # agent factory from agent/gaia_agent.py
+questions = [  # sample prompts, each sent to the agent in order by the loop below
+    "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?",
+    "In the video https://www.youtube.com/watch?v=u1xXCYZ4VYM, what is the highest number of bird species to be on screen at once?",
+    "Reverse the string 'etisoppo eht'.",
+    "What country had the least number of athletes at the 1928 Summer Olympics? Return the IOC country code.",
+    "From the chessboard image at path 'chess_1.png', what is the best move?",
+    "The attached Excel file contains food and drink sales. What are the total sales for food (excluding drinks)?",
+    "Give me a comma-separated, alphabetized list of botanical vegetables from this: milk, eggs, flour, plums, lettuce, celery, broccoli, bell pepper, zucchini.",
+    "Where were the Vietnamese specimens described by Kuznetzov in Nedoshivina’s 2010 paper eventually deposited? (City name only.)",
+    "What is the name of the novel where a Martian child grows up on Earth and founds a church?",
+    "Summarize the Wikipedia page on 'Taishō Tamai'."
+]
+agent = create_langchain_agent()  # built once at import time and reused for every question
+print("Running local benchmark...")
+for idx, question in enumerate(questions):  # idx is 0-based; displayed 1-based below
+    print(f"\nQUESTION {idx + 1}: {question}")
+    try:  # a failure on one question must not abort the remaining ones
+        answer = agent.invoke({"input": question})  # NOTE(review): result is presumably a dict, not a bare string - confirm
+        print("ANSWER:", answer)  # prints the raw invoke() result as-is
+    except Exception as e:  # any error is reported and the loop moves on
+        print("❌ Error:", e)