Fix: changes to app.py and detect_stuttering.py
- app.py +7 -3
- diagnosis/ai_engine/detect_stuttering.py +59 -7
app.py
CHANGED

@@ -3,7 +3,7 @@ import logging
 import os
 import sys
 from pathlib import Path
-from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 
@@ -70,7 +70,7 @@ async def health_check():
 @app.post("/analyze")
 async def analyze_audio(
     audio: UploadFile = File(...),
-    transcript: str = ""
+    transcript: str = Form("")
 ):
     """
     Analyze audio file for stuttering
@@ -102,10 +102,14 @@ async def analyze_audio(
         logger.info(f"Saved to: {temp_file} ({len(content) / 1024 / 1024:.2f} MB)")
 
         # Analyze
-        logger.info(f"Analyzing audio with transcript: '{transcript[:50]}...'")
+        logger.info(f"Analyzing audio with transcript: '{transcript[:50] if transcript else '(empty)'}...'")
         result = detector.analyze_audio(temp_file, transcript)
 
+        # Log transcript values from result
+        actual = result.get('actual_transcript', '')
+        target = result.get('target_transcript', '')
         logger.info(f"✅ Analysis complete: severity={result['severity']}, mismatch={result['mismatch_percentage']}%")
+        logger.info(f"Result transcripts - Actual: '{actual[:100]}' (len: {len(actual)}), Target: '{target[:100]}' (len: {len(target)})")
         return result
 
     except HTTPException:
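Note on the `Form("")` change: in FastAPI, a plain `str` parameter declared next to an `UploadFile = File(...)` is treated as a query parameter, so a `transcript` sent as a multipart form field never reached the handler and silently defaulted to `""`. Declaring it as `Form("")` makes FastAPI read it from the multipart body. A minimal client sketch against the updated endpoint (the base URL and file name here are hypothetical):

```python
# Minimal client sketch for the updated /analyze endpoint.
# Hypothetical values: base URL http://localhost:8000, local file sample.wav.
import requests

with open("sample.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/analyze",
        files={"audio": ("sample.wav", f, "audio/wav")},  # -> audio: UploadFile = File(...)
        data={"transcript": "expected reading passage"},   # -> transcript: str = Form("")
    )
resp.raise_for_status()
result = resp.json()
print(result["severity"], result["mismatch_percentage"])
```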
diagnosis/ai_engine/detect_stuttering.py
CHANGED

@@ -155,6 +155,14 @@ class AdvancedStutterDetector:
             torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
         ).to(DEVICE)
         self.model.eval()
+
+        # Debug: Log processor structure
+        logger.info(f"Processor type: {type(self.processor)}")
+        logger.info(f"Processor attributes: {[attr for attr in dir(self.processor) if not attr.startswith('_')]}")
+        if hasattr(self.processor, 'tokenizer'):
+            logger.info(f"Tokenizer type: {type(self.processor.tokenizer)}")
+        if hasattr(self.processor, 'feature_extractor'):
+            logger.info(f"Feature extractor type: {type(self.processor.feature_extractor)}")
         self.loaded_adapters = set()  # Keep for backward compatibility but not used with indicwav2vec
 
         # Anomaly Detection Model (for outlier stutter events)
@@ -320,8 +328,45 @@ class AdvancedStutterDetector:
             logits = outputs.logits
             predicted_ids = torch.argmax(logits, dim=-1)
 
-            # Decode transcript
-            transcript =
+            # Decode transcript - IndicWav2Vec uses tokenizer for decoding
+            transcript = ""
+            try:
+                # Method 1: Use the processor's tokenizer directly
+                if hasattr(self.processor, 'tokenizer'):
+                    transcript = self.processor.tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
+                    logger.info(f"Decoded via tokenizer: '{transcript}' (length: {len(transcript)})")
+                # Method 2: Fall back to batch_decode if no tokenizer attribute
+                elif hasattr(self.processor, 'batch_decode'):
+                    transcript = self.processor.batch_decode(predicted_ids)[0]
+                    logger.info(f"Decoded via batch_decode: '{transcript}' (length: {len(transcript)})")
+                # Method 3: Probe other processor attributes that may wrap a tokenizer
+                else:
+                    for attr in ['tokenizer', '_tokenizer', 'decoder']:
+                        if hasattr(self.processor, attr):
+                            tokenizer = getattr(self.processor, attr)
+                            if hasattr(tokenizer, 'decode'):
+                                transcript = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
+                                logger.info(f"Decoded via {attr}: '{transcript}' (length: {len(transcript)})")
+                                break
+
+                # Clean up transcript - remove special tokens and normalize
+                if transcript:
+                    transcript = transcript.strip()
+                    # Remove common special tokens if present
+                    transcript = transcript.replace('<pad>', '').replace('<s>', '').replace('</s>', '').replace('|', ' ').strip()
+                    # Normalize whitespace
+                    transcript = ' '.join(transcript.split())
+
+            except Exception as decode_error:
+                logger.error(f"⚠️ Decode error: {decode_error}", exc_info=True)
+                transcript = ""
+
+            # Ensure transcript is not None
+            if not transcript:
+                transcript = ""
+                logger.warning("⚠️ Empty transcript generated - model may not have produced valid output")
+                logger.warning(f"⚠️ Predicted IDs shape: {predicted_ids.shape}, sample values: {predicted_ids[0][:10].tolist() if predicted_ids.numel() > 0 else 'empty'}")
 
             # Estimate word timestamps (simplified - frame-level alignment)
             frame_duration = 0.02  # 20ms per frame
@@ -329,9 +374,9 @@
             audio_duration = len(audio) / 16000
 
             # Simple word-level timestamps (would need proper alignment for production)
-            words = transcript.split()
+            words = transcript.split() if transcript else []
             word_timestamps = []
-            time_per_word = audio_duration / max(len(words), 1)
+            time_per_word = audio_duration / max(len(words), 1) if words else 0
 
             for i, word in enumerate(words):
                 word_timestamps.append({
@@ -342,7 +387,7 @@
 
             return transcript, word_timestamps, logits
         except Exception as e:
-            logger.error(f"Transcription failed: {e}")
+            logger.error(f"❌ Transcription failed: {e}", exc_info=True)
             return "", [], torch.zeros((1, 100, 32))  # Dummy return
 
     def _calculate_uncertainty(self, logits: torch.Tensor) -> Tuple[float, List[Dict]]:
@@ -686,6 +731,7 @@ class AdvancedStutterDetector:
 
         # === STEP 4: Wav2Vec2 Transcription & Uncertainty ===
         transcript, word_timestamps, logits = self._transcribe_with_timestamps(audio)
+        logger.info(f"Main transcription result: '{transcript}' (length: {len(transcript)}, words: {len(word_timestamps)})")
         entropy_score, low_conf_regions = self._calculate_uncertainty(logits)
 
         # === STEP 5: Speaking Rate Estimation ===
@@ -759,9 +805,15 @@
         metrics['severity_score'] = max(metrics['severity_score'], 5.0)
 
         # === STEP 9: Return Comprehensive Report ===
+        # Ensure transcripts are not None
+        actual_transcript = transcript if transcript else ""
+        target_transcript = proper_transcript if proper_transcript else transcript if transcript else ""
+
+        logger.info(f"Final return - Actual: '{actual_transcript}' (len: {len(actual_transcript)}), Target: '{target_transcript}' (len: {len(target_transcript)})")
+
         return {
-            'actual_transcript':
-            'target_transcript':
+            'actual_transcript': actual_transcript,
+            'target_transcript': target_transcript,
             'mismatched_chars': [f"{r['time']}s" for r in low_conf_regions],
             'mismatch_percentage': metrics['severity_score'],
             'ctc_loss_score': round(entropy_score, 4),
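The decode fallback chain above probes the processor at runtime because the IndicWav2Vec checkpoint may not expose the standard `Wav2Vec2Processor` surface, which is also what the added debug logging in `__init__` is meant to reveal. For a stock processor/model pair, greedy CTC decoding reduces to `batch_decode` on the per-frame argmax. A minimal sketch, assuming a standard HuggingFace checkpoint (the model ID below is a stand-in for illustration, not the project's actual IndicWav2Vec weights):

```python
# Greedy CTC decoding with a stock Wav2Vec2 pair - a minimal sketch.
# facebook/wav2vec2-base-960h is a stand-in checkpoint, not this project's model.
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").eval()

def transcribe(audio: np.ndarray, sample_rate: int = 16000) -> str:
    # Feature-extract the raw waveform and run the CTC head
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        logits = model(inputs.input_values).logits  # (batch, frames, vocab)
    predicted_ids = torch.argmax(logits, dim=-1)    # greedy per-frame argmax
    # batch_decode collapses repeated tokens and strips CTC blanks
    return processor.batch_decode(predicted_ids)[0]
```

If the loaded processor is a bare tokenizer or a custom wrapper instead of this standard pair, only the `hasattr` probes in the patch will show which decode path applies.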
|