Spaces: Running on Zero

jedick committed
Commit · 7e18a82
1 Parent(s): f42e9e5

Enable FlashAttention

Files changed:
- graph.py         +42 -22
- main.py           +6  -1
- pipeline.py       +1  -0
- requirements.txt  +3  -0
- retriever.py      +2  -0
graph.py CHANGED

@@ -43,8 +43,18 @@ def print_message_summaries(messages, header):
         print(f"{type_txt}: {summary_txt}")


-def normalize_messages(messages):
-    """
+def normalize_messages(messages, summaries_for=None):
+    """
+    Normalize messages to sequence of types expected by chat models
+
+    Args:
+        messages (list): message list
+        summaries_for (str): "query" or "answer" to print messages summaries or None for no summaries
+    """
+    if summaries_for:
+        print_message_summaries(
+            messages, f"--- {summaries_for}: before normalization ---"
+        )
     # Copy the most recent HumanMessage to the end
     # - Avoids SmolLM and Qwen ValueError: Last message must be a HumanMessage!
     if not type(messages[-1]) is HumanMessage:

@@ -88,6 +98,10 @@ def normalize_messages(messages):
         if not hasattr(msg, "tool_calls")
         or (hasattr(msg, "tool_calls") and not msg.tool_calls)
     ]
+    if summaries_for:
+        print_message_summaries(
+            messages, f"--- {summaries_for}: after normalization ---"
+        )
     return messages


@@ -118,6 +132,7 @@ def BuildGraph(
     top_k=6,
     think_query=False,
     think_answer=False,
+    local_citations=True,
     embedding_ckpt_dir=None,
 ):
     """

@@ -128,8 +143,10 @@ def BuildGraph(
         compute_mode: remote or local (for retriever)
         search_type: dense, sparse, or hybrid (for retriever)
         top_k: number of documents to retrieve
-        think_query: Whether to use thinking mode for the query
-        think_answer: Whether to use thinking mode for the answer
+        think_query: Whether to use thinking mode for the query (local model)
+        think_answer: Whether to use thinking mode for the answer (local model)
+        local_citations: Whether to use answer_with_citations() tool (local model)
+        embedding_ckpt_dir: Directory for embedding model checkpoint

     Based on:
     https://python.langchain.com/docs/how_to/qa_sources

@@ -175,10 +192,10 @@ def BuildGraph(
         Use optional "months" argument to search by month.

         Args:
-            search_query: Search query
-
-
-
+            search_query (str): Search query
+            start_year (int, optional): Starting year for emails
+            end_year (int, optional): Ending year for emails
+            months (str, optional): One or more months separated by spaces
         """
         retriever = BuildRetriever(
             compute_mode, search_type, top_k, start_year, end_year, embedding_ckpt_dir

@@ -208,8 +225,8 @@ def BuildGraph(
         An answer to the question, with citations of the emails used (senders and dates).

         Args:
-            answer: An answer to the question
-            citations: Citations of emails used to answer the question, e.g. Jane Doe, 2025-07-04; John Smith, 2020-01-01
+            answer (str): An answer to the question
+            citations (str): Citations of emails used to answer the question, e.g. Jane Doe, 2025-07-04; John Smith, 2020-01-01
         """
         return answer, citations

@@ -220,8 +237,14 @@ def BuildGraph(
         query_model = ToolifyHF(
             chat_model, query_prompt(chat_model, think=think_query)
         ).bind_tools([retrieve_emails])
-
-
+        if local_citations:
+            answer_model = ToolifyHF(
+                chat_model,
+                answer_prompt(chat_model, think=think_answer, with_tools=True),
+            ).bind_tools([answer_with_citations])
+        else:
+            # Don't use answer_with_citations tool because responses with are sometimes unparseable
+            answer_model = chat_model
     else:
         # For remote model (OpenAI API)
         query_model = chat_model.bind_tools([retrieve_emails])

@@ -235,9 +258,7 @@ def BuildGraph(
         if is_local:
             # Don't include the system message here because it's defined in ToolCallingLLM
             messages = state["messages"]
-
-            messages = normalize_messages(messages)
-            # print_message_summaries(messages, "--- query: after normalization ---")
+            messages = normalize_messages(messages, "query")
         else:
             messages = [SystemMessage(query_prompt(chat_model))] + state["messages"]
         response = query_model.invoke(messages)

@@ -248,13 +269,12 @@ def BuildGraph(
         """Generates an answer with the chat model"""
         if is_local:
             messages = state["messages"]
-
-
-
-
-
-
-            # print_message_summaries(messages, "--- answer: after normalization ---")
+            messages = normalize_messages(messages, "answer")
+            if not local_citations:
+                # Add the system message here if we're not using tools
+                messages = [
+                    SystemMessage(answer_prompt(chat_model, think=think_answer))
+                ] + messages
         else:
             messages = [
                 SystemMessage(answer_prompt(chat_model, with_tools=True))
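For readers following the graph.py changes, below is a minimal sketch (not part of the commit) of how the new summaries_for argument to normalize_messages() might be exercised. It assumes the Space's graph.py is importable and that the message classes come from langchain_core; the example messages are placeholders.

# Sketch only: exercising the summaries_for hook added in this commit.
from langchain_core.messages import AIMessage, HumanMessage

from graph import normalize_messages  # assumes graph.py is on the import path

messages = [
    HumanMessage("Who emailed about the server migration?"),
    AIMessage("Let me check the email archive."),
]

# Prints "--- query: before normalization ---" and "--- query: after normalization ---"
# summaries around the cleanup; passing summaries_for=None keeps it silent.
normalized = normalize_messages(messages, summaries_for="query")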
main.py CHANGED

@@ -154,6 +154,10 @@ def GetChatModel(compute_mode, ckpt_dir=None):
         id_or_dir,
         # We need this to load the model in BF16 instead of fp32 (torch.float)
         torch_dtype=torch.bfloat16,
+        # Enable FlashAttention (requires pip install flash-attn)
+        # https://huggingface.co/docs/transformers/en/attention_interface
+        # https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2
+        attn_implementation="flash_attention_2",
     )

     # Use MyTextGenerationPipeline with custom preprocess() method

@@ -164,7 +168,8 @@ def GetChatModel(compute_mode, ckpt_dir=None):
         return_full_text=False,
         # It seems that max_new_tokens has to be specified here, not in .invoke()
         max_new_tokens=2000,
-        # Use padding for
+        # Use padding for proper alignment for FlashAttention
+        # Part of fix for: "RuntimeError: p.attn_bias_ptr is not correctly aligned"
         # https://github.com/google-deepmind/gemma/issues/169
         padding="longest",
     )
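For context, the arguments added above can be tried in a standalone Transformers load. The sketch below is not from the repository; it assumes a CUDA GPU with flash-attn installed and uses an illustrative model id rather than the one the Space actually loads.

# Sketch: load a causal LM in BF16 with FlashAttention-2 enabled,
# mirroring the from_pretrained() arguments added in this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen3-0.6B"  # placeholder; not necessarily the Space's model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,               # BF16 instead of fp32
    attn_implementation="flash_attention_2",  # requires pip install flash-attn
    device_map="auto",
)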
pipeline.py CHANGED

@@ -8,6 +8,7 @@ class MyTextGenerationPipeline(TextGenerationPipeline):
     This subclass overrides the preprocess method to add pad_to_multiple_of=8 to tokenizer_kwargs.
     Fix for: "RuntimeError: p.attn_bias_ptr is not correctly aligned"
     https://github.com/google-deepmind/gemma/issues/169
+    NOTE: we also need padding="longest", which is set during class instantiation
     """

     def preprocess(
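The body of the overridden preprocess() is outside this hunk. As a rough illustration of what pad_to_multiple_of=8 together with padding="longest" does at the tokenizer level (the tokenizer choice is a placeholder, not the Space's), consider:

# Sketch: padding="longest" pads to the longest prompt in the batch, and
# pad_to_multiple_of=8 rounds that length up to a multiple of 8, which is the
# alignment fix discussed in the linked Gemma issue.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer
tokenizer.pad_token = tokenizer.eos_token  # gpt2 defines no pad token by default
batch = tokenizer(
    ["short prompt", "a somewhat longer prompt for the email assistant"],
    padding="longest",
    pad_to_multiple_of=8,
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # sequence length is rounded up to a multiple of 8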
requirements.txt CHANGED

@@ -4,6 +4,9 @@ chromadb==0.6.3
 # NOTE: chromadb==1.0.13 was giving intermittent error:
 # ValueError('Could not connect to tenant default_tenant. Are you sure it exists?')

+# FlashAttention
+flash-attn==2.8.2
+
 # Stated requirements:
 # Gemma 3: transformers>=4.50
 # Qwen3: transformers>=4.51
retriever.py CHANGED

@@ -49,6 +49,7 @@ def BuildRetriever(
         top_k: Number of documents to retrieve for "dense" and "sparse"
         start_year: Start year (optional)
         end_year: End year (optional)
+        embedding_ckpt_dir: Directory for embedding model checkpoint
     """
     if search_type == "dense":
         if not (start_year or end_year):

@@ -134,6 +135,7 @@ def BuildRetrieverDense(compute_mode: str, top_k=6, embedding_ckpt_dir=None):
     Args:
         compute_mode: Compute mode for embeddings (remote or local)
         top_k: Number of documents to retrieve
+        embedding_ckpt_dir: Directory for embedding model checkpoint
     """

     # Don't try to use local models without a GPU