Spaces:

Archime
/

canary_aed_streaming

Running on Zero

App Files Files Community

Archime committed on Nov 14

Commit

7b84154

1 Parent(s): 010aaff

impl CanaryConfig from ui

Browse files

Files changed (2) hide show

app.py +31 -10
app/canary_speech_engine.py +114 -5

app.py CHANGED Viewed

@@ -52,25 +52,39 @@ reset_all_active_session_hash_code()
 theme,css_style = get_custom_theme()
-from omegaconf import OmegaConf
-cfg = OmegaConf.load('app/config.yaml')
 # logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
-from app.canary_speech_engine import CanarySpeechEngine
 from app.silero_vad_engine import Silero_Vad_Engine
 from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
-asr_model = nemo_asr.models.ASRModel.from_pretrained(cfg.pretrained_name)
-canary_speech_engine = CanarySpeechEngine(asr_model,cfg)
-silero_vad_engine = Silero_Vad_Engine()
 streaming_audio_processor_config = StreamingAudioProcessorConfig(
     read_size=4000,
     silence_threshold_chunks=1
 )
-streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
 @spaces.GPU
-def task(session_id: str):
     """Continuously read and delete .npz chunks while task is active."""
     active_flag = get_active_task_flag_file(session_id)
     with open(active_flag, "w") as f:
         f.write("1")
@@ -319,6 +333,7 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
                         interactive=False,
                         visible=False
                     )
                     stop_stream_button = gr.Button("Stop Streaming", visible=False)
                     transcription_output = gr.Textbox(
@@ -365,9 +380,15 @@ with gr.Blocks(theme=theme, css=css_style) as demo:
                         accumulated = ""
                         yield f"Starting {task_type.lower()}...\n\n",gr.update(visible=False),gr.update(visible=True)
                         # Boucle sur le générateur de `task()`
-                        for msg in task(session_hash_code):
                             accumulated += msg
                             yield accumulated,gr.update(visible=False),gr.update(visible=True)

 theme,css_style = get_custom_theme()
 # logger.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
+from app.canary_speech_engine import CanarySpeechEngine,CanaryConfig
 from app.silero_vad_engine import Silero_Vad_Engine
 from app.streaming_audio_processor import StreamingAudioProcessor,StreamingAudioProcessorConfig
+asr_model = nemo_asr.models.ASRModel.from_pretrained("nvidia/canary-1b-v2")
 streaming_audio_processor_config = StreamingAudioProcessorConfig(
     read_size=4000,
     silence_threshold_chunks=1
 )
 @spaces.GPU
+def task(session_id: str,
+        task_type, lang_source, lang_target,
+        chunk_secs, left_context_secs, right_context_secs,
+        streaming_policy, alignatt_thr, waitk_lagging,
+        exclude_sink_frames, xatt_scores_layer, hallucinations_detector
+         ):
     """Continuously read and delete .npz chunks while task is active."""
+    yield f"initializing the CanarySpeechEngine and Silero_Vad_Engine\n\n"
+    # initialize the CanarySpeechEngine and Silero_Vad_Engine
+    conf = CanaryConfig.from_params(
+        task_type, lang_source, lang_target,
+        chunk_secs, left_context_secs, right_context_secs,
+        streaming_policy, alignatt_thr, waitk_lagging,
+        exclude_sink_frames, xatt_scores_layer, hallucinations_detector
+    )
+    canary_speech_engine = CanarySpeechEngine(asr_model,conf)
+    silero_vad_engine = Silero_Vad_Engine()
+    streamer = StreamingAudioProcessor(speech_engine=canary_speech_engine,vad_engine=silero_vad_engine,cfg=streaming_audio_processor_config)
+    yield f"initialized the CanarySpeechEngine and Silero_Vad_Engine\n\n"
+    yield f"Task started for session {session_id}\n\n"
     active_flag = get_active_task_flag_file(session_id)
     with open(active_flag, "w") as f:
         f.write("1")
                         interactive=False,
                         visible=False
                     )
                     stop_stream_button = gr.Button("Stop Streaming", visible=False)
                     transcription_output = gr.Textbox(
                         accumulated = ""
                         yield f"Starting {task_type.lower()}...\n\n",gr.update(visible=False),gr.update(visible=True)
                         # Boucle sur le générateur de `task()`
+                        for msg in task(
+                            session_hash_code,
+                            task_type, lang_source, lang_target,
+                            chunk_secs, left_context_secs, right_context_secs,
+                            streaming_policy, alignatt_thr, waitk_lagging,
+                            exclude_sink_frames, xatt_scores_layer, hallucinations_detector
+                                        ):
                             accumulated += msg
                             yield accumulated,gr.update(visible=False),gr.update(visible=True)

app/canary_speech_engine.py CHANGED Viewed

@@ -32,6 +32,115 @@ from app.logger_config import (
 )
 def make_divisible_by(num: int, factor: int) -> int:
     """Return num with its remainder modulo factor removed.

     The result is the floor-division quotient of ``num`` by ``factor``
     multiplied back by ``factor``.
     """
     whole_multiples, _ = divmod(num, factor)
     return whole_multiples * factor
@@ -42,18 +151,18 @@ class CanarySpeechEngine(IStreamingSpeechEngine):
     Encapsulates the state and logic for streaming audio transcription
     using an internally loaded Canary model.
     """
-    def __init__(self,asr_model, cfg: OmegaConf):
         """
         Initializes the speech engine and loads the ASR model.
         Args:
             cfg: An OmegaConf object containing 'model' and 'streaming' configs.
         """
-        self.cfg = cfg # Store the full config
         # Setup device and dtype from config
-        self.map_location = get_inference_device(cuda=self.cfg.cuda, allow_mps=self.cfg.allow_mps)
-        self.compute_dtype = get_inference_dtype(self.cfg.compute_dtype, device=self.map_location)
         logging.info(f"Inference will be on device: {self.map_location} with dtype: {self.compute_dtype}")
         # Load the model internally
@@ -71,7 +180,7 @@ class CanarySpeechEngine(IStreamingSpeechEngine):
     def _setup_model(self,asr_model, model_cfg: OmegaConf, map_location: str):
         """Loads the pretrained ASR model and configures it for inference."""
-        logging.info(f"Loading model {model_cfg.pretrained_name}...")
         start_time = time.time()
         try:
             asr_model = asr_model.to(map_location)

 )
+from dataclasses import dataclass
+from typing import Optional, Literal
@dataclass
class CanaryConfig:
    """Settings used to configure a ``CanarySpeechEngine``.

    Options are stored as flat fields. ``__post_init__`` gathers the
    decoding-related fields into ``decoding`` and the prompt-related fields
    into ``prompt`` unless the caller supplied those dictionaries explicitly.
    ``toOmegaConf`` exports the top-level settings as an OmegaConf config.
    """

    # Chunk and context window lengths (presumably seconds, per the `_secs` suffix).
    chunk_secs: float = 1.0
    left_context_secs: float = 20.0
    right_context_secs: float = 0.5
    # Device / precision options. Entries left as None are omitted by toOmegaConf().
    cuda: Optional[bool] = None
    allow_mps: bool = True
    compute_dtype: Optional[str] = None
    matmul_precision: str = "high"
    # Nested decoding options; built from the fields below when left as None.
    decoding: Optional[dict] = None
    streaming_policy: str = "alignatt"
    alignatt_thr: float = 8.0
    waitk_lagging: int = 2
    exclude_sink_frames: int = 8
    xatt_scores_layer: int = -2
    max_tokens_per_alignatt_step: int = 30
    max_generation_length: int = 512
    use_avgpool_for_alignatt: bool = False
    hallucinations_detector: bool = True
    # Nested prompt options; built from the fields below when left as None.
    prompt: Optional[dict] = None
    pnc: str = "no"
    task: str = "asr"
    source_lang: str = "fr"
    target_lang: str = "fr"
    timestamps: bool = True
    # An unannotated attribute is not a dataclass field, so batch_size needs an
    # annotation to be settable through the constructor. It is declared last so
    # that the positional order of every other field stays unchanged.
    batch_size: int = 1

    def __post_init__(self):
        """Fill in ``decoding`` and ``prompt`` from the flat fields when not given."""
        if self.decoding is None:
            self.decoding = {
                "streaming_policy": self.streaming_policy,
                "alignatt_thr": self.alignatt_thr,
                "waitk_lagging": self.waitk_lagging,
                "exclude_sink_frames": self.exclude_sink_frames,
                "xatt_scores_layer": self.xatt_scores_layer,
                "max_tokens_per_alignatt_step": self.max_tokens_per_alignatt_step,
                "max_generation_length": self.max_generation_length,
                "use_avgpool_for_alignatt": self.use_avgpool_for_alignatt,
                "hallucinations_detector": self.hallucinations_detector,
            }
        if self.prompt is None:
            self.prompt = {
                "pnc": self.pnc,
                "task": self.task,
                "source_lang": self.source_lang,
                "target_lang": self.target_lang,
                "timestamps": self.timestamps,
            }

    def toOmegaConf(self) -> "DictConfig":
        """Export the top-level settings as an OmegaConf configuration.

        Only the streaming window sizes, device/precision options,
        ``batch_size`` and the nested ``decoding`` / ``prompt`` dictionaries
        are exported. Keys whose value is ``None`` are left out of the result.

        Returns:
            The configuration produced by ``OmegaConf.create``.
        """
        config_dict = {
            "chunk_secs": self.chunk_secs,
            "left_context_secs": self.left_context_secs,
            "right_context_secs": self.right_context_secs,
            "cuda": self.cuda,
            "allow_mps": self.allow_mps,
            "compute_dtype": self.compute_dtype,
            "matmul_precision": self.matmul_precision,
            "batch_size": self.batch_size,
            "decoding": self.decoding,
            "prompt": self.prompt,
        }
        # Drop unset (None) entries so they do not appear as keys in the config.
        filtered_dict = {k: v for k, v in config_dict.items() if v is not None}
        return OmegaConf.create(filtered_dict)

    @classmethod
    def from_params(
        cls,
        task_type: str,
        source_lang: str,
        target_lang: str,
        chunk_secs: float = 1.0,
        left_context_secs: float = 20.0,
        right_context_secs: float = 0.5,
        streaming_policy: str = "alignatt",
        alignatt_thr: float = 8.0,
        waitk_lagging: int = 2,
        exclude_sink_frames: int = 8,
        xatt_scores_layer: int = -2,
        hallucinations_detector: bool = True
    ):
        """Create a CanaryConfig from individual parameter values.

        Args:
            task_type: ``"Transcription"`` selects the ``"asr"`` task; any
                other value selects ``"ast"``.
            source_lang: Stored as the prompt's ``source_lang``.
            target_lang: Stored as the prompt's ``target_lang``.
            chunk_secs, left_context_secs, right_context_secs: Streaming
                window settings, forwarded unchanged.
            streaming_policy, alignatt_thr, waitk_lagging,
            exclude_sink_frames, xatt_scores_layer, hallucinations_detector:
                Decoding settings, forwarded unchanged.

        Returns:
            A new ``CanaryConfig``; all remaining fields keep their defaults.
        """
        # Convert the task type label to the model task name.
        task = "asr" if task_type == "Transcription" else "ast"
        return cls(
            chunk_secs=chunk_secs,
            left_context_secs=left_context_secs,
            right_context_secs=right_context_secs,
            streaming_policy=streaming_policy,
            alignatt_thr=alignatt_thr,
            waitk_lagging=waitk_lagging,
            exclude_sink_frames=exclude_sink_frames,
            xatt_scores_layer=xatt_scores_layer,
            hallucinations_detector=hallucinations_detector,
            task=task,
            source_lang=source_lang,
            target_lang=target_lang,
        )
 def make_divisible_by(num: int, factor: int) -> int:
     """Return num with its remainder modulo factor removed.

     The result is the floor-division quotient of ``num`` by ``factor``
     multiplied back by ``factor``.
     """
     whole_multiples, _ = divmod(num, factor)
     return whole_multiples * factor
     Encapsulates the state and logic for streaming audio transcription
     using an internally loaded Canary model.
     """
+    def __init__(self,asr_model, cfg: CanaryConfig):
         """
         Initializes the speech engine and loads the ASR model.
         Args:
             cfg: An OmegaConf object containing 'model' and 'streaming' configs.
         """
+        self.cfg = cfg.toOmegaConf() # Store the full config
         # Setup device and dtype from config
+        self.map_location = get_inference_device(cuda=None, allow_mps=self.cfg.allow_mps)
+        self.compute_dtype = get_inference_dtype(None, device=self.map_location)
         logging.info(f"Inference will be on device: {self.map_location} with dtype: {self.compute_dtype}")
         # Load the model internally
     def _setup_model(self,asr_model, model_cfg: OmegaConf, map_location: str):
         """Loads the pretrained ASR model and configures it for inference."""
+        logging.info(f"Loading model ...")
         start_time = time.time()
         try:
             asr_model = asr_model.to(map_location)