jaykishan-b commited on
Commit
79b7942
·
1 Parent(s): ddddf1b
.gitignore ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Compiled Java class files
2
+ *.class
3
+
4
+ # Compiled Python bytecode
5
+ *.py[cod]
6
+
7
+ # Log files
8
+ *.log
9
+
10
+ # Package files
11
+ *.jar
12
+
13
+ # Maven
14
+ target/
15
+ dist/
16
+
17
+ # JetBrains IDE
18
+ .idea/
19
+
20
+ # Unit test reports
21
+ TEST*.xml
22
+
23
+ # Generated by MacOS
24
+ .DS_Store
25
+
26
+ # Generated by Windows
27
+ Thumbs.db
28
+
29
+ # Applications
30
+ *.app
31
+ *.exe
32
+ *.war
33
+
34
+ # Large media files
35
+ *.mp4
36
+ *.tiff
37
+ *.avi
38
+ *.flv
39
+ *.mov
40
+ *.wmv
41
+
42
+ # Byte-compiled / optimized / DLL files
43
+ __pycache__/
44
+ */__pycache__/
45
+ */*/__pycache__/
46
+ */*/*/__pycache__/
47
+ *.py[cod]
48
+ */*.py[cod]
49
+ */*/*.py[cod]
50
+ */*/*/*.py[cod]
51
+
52
+ # VSCode IDE
53
+ .vscode/
54
+
55
+ # Environment
56
+ env/
57
+ venv/
58
+ .env
59
+ .venv
60
+
61
+ audio/
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.9-slim
3
+
4
+ # Set the working directory
5
+ WORKDIR /app
6
+
7
+ # Install dependencies
8
+ COPY requirements.txt .
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ # Install spaCy model
12
+ RUN python -m spacy download en_core_web_sm
13
+
14
+ # Copy the current directory contents into the container
15
+ COPY . .
16
+
17
+ # Expose the port FastAPI will run on
18
+ EXPOSE 7860
19
+
20
+ # Command to run the application with Gunicorn and Uvicorn workers
21
+ CMD ["uvicorn", "app.main:server", "--host", "0.0.0.0", "--port", "7860", "--workers", "4"]
__init__.py ADDED
File without changes
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from pydantic_settings import BaseSettings
4
+
5
+
6
class Settings(BaseSettings):
    """Application settings, loaded from the environment or a `.env` file.

    The previous `os.getenv(...)` defaults were redundant (pydantic-settings
    already reads environment variables) and harmful: they were evaluated at
    class-definition time, before the `.env` file is loaded, so a variable
    present only in `.env` produced a `None` default that failed validation
    with a confusing "none is not a str" error. Declaring the fields without
    defaults makes them properly required.
    """

    # Azure Cognitive Services speech credentials — required at startup.
    AZURE_AI_SUBSCRIPTION_KEY: str
    AZURE_AI_REGION: str

    class Config:
        # Fallback source when the variables are not in the process env.
        env_file = ".env"


settings = Settings()
app/main.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import uvicorn
4
+ from fastapi import FastAPI
5
+ from fastapi.responses import HTMLResponse
6
+ from fastapi.staticfiles import StaticFiles
7
+
8
+ from app.config import settings
9
+ from app.manage import configure_app
10
+ from app.utils.constants import APP_DESCRIPTION, APP_NAME, APP_VERSION, DOCS_URL, OPENAPI_URL, STATIC_DIR
11
+
12
+
13
def initialize_app():
    """Initializes the app by adding required configurations using configure_app

    Returns:
        app: returns configured app's FastAPI instance.
    """

    fastapi_app = FastAPI(title=APP_NAME, version=APP_VERSION, description=APP_DESCRIPTION)

    # app configuration - one time (routers, logging middleware, CORS)
    configure_app(fastapi_app)

    # Mount the static folder
    fastapi_app.mount(
        "/static",
        StaticFiles(directory=STATIC_DIR),
        name="static",
    )

    @fastapi_app.get("/")
    async def read_index():
        # Serve the single-page UI. Explicit encoding avoids decode errors on
        # platforms whose default locale encoding is not UTF-8.
        with open(os.path.join(STATIC_DIR, "index.html"), encoding="utf-8") as f:
            return HTMLResponse(content=f.read(), status_code=200)

    return fastapi_app
38
+
39
+
40
# initialize app at the end
server = initialize_app()


def start():
    """Run the app with uvicorn for local development (the Dockerfile CMD
    runs the same `app.main:server` on port 7860 instead)."""
    uvicorn.run(server, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    start()
app/manage.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.middleware.cors import CORSMiddleware
2
+
3
+ from app.middlewares.logging import RequestLoggingMiddleware
4
+ from app.services.speech_analysis import speech_analysis_router
5
+ from app.utils.constants import API_PREFIX
6
+
7
+ __all__ = ["configure_app"]
8
+
9
+
10
def configure_app(app):
    """Configure and return Fast Application by adding routers and middlewares to the app"""
    # Configure router: all speech-analysis endpoints live under API_PREFIX.
    app.include_router(speech_analysis_router, prefix=API_PREFIX)

    # configure middlewares — request-id logging wraps every request
    app.add_middleware(RequestLoggingMiddleware)

    # CORS settings to allow frontend requests
    # NOTE(review): allow_origins=["*"] combined with allow_credentials=True
    # is rejected by browsers for credentialed requests (wildcard origin is
    # not allowed with credentials) — confirm whether credentials are needed.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    return app
app/middlewares/logging.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import uuid
3
+ import time
4
+
5
+ import loguru
6
+ from starlette.middleware.base import BaseHTTPMiddleware
7
+
8
# Module-wide loguru logger: remove the default sink and install a stdout
# sink whose format includes the per-request id bound via
# logger.contextualize(request_id=...) in the middleware below.
logger = loguru.logger
logger.remove()
logger.add(
    sys.stdout,
    format="{time} - {level} - {file.path} - {function} - {line} - ({extra[request_id]}) :: {message}",
    level="DEBUG",
    backtrace=False,
)
16
+
17
+
18
class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Middleware for logging incoming HTTP requests and their responses.

    Attributes:
        app: The ASGI application to which the middleware is applied.

    Methods:
        dispatch: Intercepts each HTTP request, tags its log records with a
            unique request id, logs start/end, and logs any exception raised
            while processing the request.
    """

    def __init__(self, app):
        super().__init__(app)

    async def dispatch(self, request, call_next):
        # Unique id so all log lines of one request can be correlated.
        request_id = str(uuid.uuid4())
        with logger.contextualize(request_id=request_id):
            logger.info("Request started")
            try:
                return await call_next(request)
            except Exception as ex:
                logger.exception(f"Request failed: {ex}")
                # Re-raise so the server produces a proper 500 response.
                # Swallowing the exception made dispatch return None, which
                # crashes starlette later with an unrelated TypeError.
                raise
            finally:
                logger.info("Request ended")
app/models/speech_analysis.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
+
4
class SpeechAnalysisResponse(BaseModel):
    """Response schema for the /analyze endpoint."""

    # 0-100 scores from Azure pronunciation assessment (averaged per chunk)
    fluency_score: float
    pronunciation_score: float
    completeness_score: float
    accuracy_score: float
    # 0-100 scores derived from transcript analysis heuristics
    grammar_score: float
    comprehension_score: float
    intonation_score: float
    # Free-text feedback (currently placeholder "Demo Content" in the API)
    pronunciation_feedback: str
    fluency_feedback: str
    accuracy_feedback: str
    grammar_feedback: str
    intonation_feedback: str
    comprehension_feedback: str
    # Detected grammar errors with word positions and suggestions
    grammar_errors: list
    # NOTE(review): name is misspelled ("mispronunced") but it is part of the
    # public API contract — renaming would break existing clients.
    mispronunced_words: list
    # Full recognized transcript returned by the speech service
    display_text: str
app/services/healthcheck.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Response
2
+
3
+ from app.utils.constants import API_READINESS, API_LIVENESS
4
+
5
# Router exposing kubernetes-style health probes (hidden from the schema).
router = APIRouter()


@router.get(API_READINESS, include_in_schema=False)
async def readiness_controller(response: Response):
    # Readiness probe: respond 200 with an empty body.
    response.status_code = 200


@router.get(API_LIVENESS, include_in_schema=False)
async def liveness_controller(response: Response):
    # Liveness probe: respond 200 with an empty body.
    response.status_code = 200
app/services/speech_analysis.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor
5
+
6
+ from fastapi import APIRouter, FastAPI, File, Form, UploadFile
7
+ from fastapi.responses import HTMLResponse
8
+ from pydub import AudioSegment
9
+
10
+ from app.utils.ai_speech import pronunciation_assessment
11
+ from app.utils.common import aggregate_scores, remove_files, split_audio
12
+ from app.utils.get_scores import get_content_score
13
+ from app.models.speech_analysis import SpeechAnalysisResponse
14
+ from app.utils.constants import UPLOAD_DIR
15
+ from loguru import logger
16
+
17
+ speech_analysis_router = APIRouter()
18
+
19
+
20
@speech_analysis_router.post("/analyze", response_model=SpeechAnalysisResponse)
async def analyze_speech(audio_file: UploadFile = File(...), language: str = Form(...)):
    """Analyze an uploaded speech recording and return combined scores.

    Saves the upload, converts MP3 to WAV if needed, splits recordings longer
    than 30 s into chunks, scores pronunciation per chunk via Azure in
    parallel with transcript-based grammar/intonation/comprehension scoring,
    then aggregates everything into one response.

    Args:
        audio_file: uploaded audio file (WAV or MP3).
        language: recognition language tag, e.g. ``en-GB``.

    Returns:
        dict matching the SpeechAnalysisResponse schema.
    """
    # Make audio directory if not available
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    s_time = time.time()
    # basename() guards against path-traversal characters in the client-
    # supplied filename.
    audio_file_path = os.path.join(UPLOAD_DIR, os.path.basename(audio_file.filename))

    with open(audio_file_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    if audio_file_path.endswith("mp3"):
        # Local import: the original code called convert_mp3_to_wav without
        # importing it, raising NameError for every MP3 upload.
        from app.utils.common import convert_mp3_to_wav

        audio_file_path = convert_mp3_to_wav(audio_file_path, audio_file_path.replace("mp3", "wav"))

    # Split audio if longer than 30 seconds
    audio_length_ms = AudioSegment.from_wav(audio_file_path).duration_seconds * 1000
    if audio_length_ms > 30000:
        chunk_paths = split_audio(audio_file_path)
    else:
        chunk_paths = [audio_file_path]  # No splitting needed

    # Pronunciation (per chunk) and content scoring are independent, so run
    # them concurrently.
    with ThreadPoolExecutor() as executor:
        content_future = executor.submit(get_content_score, audio_file_path)
        pronunciation_futures = [executor.submit(pronunciation_assessment, path, language) for path in chunk_paths]
        content_score = content_future.result()
        chunk_scores = [future.result() for future in pronunciation_futures]

    # Aggregate azure_speech_scores across chunks
    final_azure_speech_score = aggregate_scores(chunk_scores)

    # Combine final results
    final_azure_speech_score.update(
        {
            "grammar_score": content_score.get("grammar_score"),
            "intonation_score": content_score.get("intonation_score"),
            "comprehension_score": content_score.get("comprehension_score"),
            "grammar_errors": content_score.get("grammar_errors"),
            # TODO: replace placeholder feedback with generated text
            "pronunciation_feedback": "Demo Content",
            "fluency_feedback": "Demo Content",
            "accuracy_feedback": "Demo Content",
            "grammar_feedback": "Demo Content",
            "intonation_feedback": "Demo Content",
            "comprehension_feedback": "Demo Content",
        }
    )

    logger.info(str(time.time() - s_time))

    # De-duplicate before cleanup: when no split happened, audio_file_path is
    # already in chunk_paths and would otherwise be deleted twice.
    remove_files(list(dict.fromkeys(chunk_paths + [audio_file_path])))

    return final_azure_speech_score
app/static/index.html ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Speech Analysis</title>
8
+ <!-- Bootstrap CSS -->
9
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet">
10
+ <style>
11
+ body {
12
+ background-color: #f4f7f8;
13
+ font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
14
+ }
15
+
16
+ .container {
17
+ margin-top: 50px;
18
+ padding: 20px;
19
+ background: #fff;
20
+ box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
21
+ border-radius: 10px;
22
+ }
23
+
24
+ .progress-bar-inner {
25
+ width: 0%;
26
+ transition: width 1s ease-in-out;
27
+ }
28
+
29
+ .audio-player {
30
+ margin-top: 20px;
31
+ }
32
+
33
+ .btn-analyze {
34
+ background-color: #007bff;
35
+ color: white;
36
+ border-radius: 5px;
37
+ padding: 10px 20px;
38
+ }
39
+
40
+ .btn-analyze:hover {
41
+ background-color: #0056b3;
42
+ }
43
+
44
+ .score-label {
45
+ display: flex;
46
+ justify-content: space-between;
47
+ font-weight: 600;
48
+ }
49
+
50
+ .feedback-section {
51
+ margin-top: 30px;
52
+ }
53
+
54
+ .highlight-mispronounced {
55
+ background-color: yellow;
56
+ font-weight: bold;
57
+ }
58
+
59
+ .highlight {
60
+ background-color: yellow;
61
+ font-weight: bold;
62
+ }
63
+
64
+ .highlight-grammar {
65
+ background-color: lightpink;
66
+ /* or any other color */
67
+ font-weight: bold;
68
+ }
69
+ </style>
70
+ </head>
71
+
72
+ <body>
73
+
74
+ <div class="container">
75
+ <h1 class="text-center mb-4">Speech Analysis</h1>
76
+
77
+ <div class="mb-3">
78
+ <label for="audio-file" class="form-label">Upload your audio file:</label>
79
+ <input type="file" class="form-control" id="audio-file" accept="audio/*">
80
+ </div>
81
+
82
+ <!-- Language Dropdown -->
83
+ <div class="mb-3">
84
+ <label for="language-select" class="form-label">Select Language:</label>
85
+ <select class="form-select" id="language-select">
86
+ <option value="en-GB">English (United Kingdom)</option>
87
+ <option value="nb-NO">Norwegian</option>
88
+ </select>
89
+ </div>
90
+
91
+ <div class="text-center mb-4">
92
+ <button class="btn btn-analyze" onclick="analyzeAudio()">Analyze Speech</button>
93
+ </div>
94
+
95
+ <div class="audio-player text-center" id="audio-player-container" style="display: none;">
96
+ <audio id="audio-player" controls></audio>
97
+ </div>
98
+
99
+ <h3 class="mt-5">Speech Scores</h3>
100
+
101
+ <!-- Scores with Progress Bars -->
102
+ <div id="scores-container">
103
+ <div class="mb-3">
104
+ <div class="score-label">
105
+ <span>Fluency Score</span>
106
+ <span id="fluency-score">0%</span>
107
+ </div>
108
+ <div class="progress">
109
+ <div class="progress-bar progress-bar-inner bg-success" id="fluency-progress" role="progressbar"
110
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
111
+ </div>
112
+ </div>
113
+
114
+ <div class="mb-3">
115
+ <div class="score-label">
116
+ <span>Pronunciation Score</span>
117
+ <span id="pronunciation-score">0%</span>
118
+ </div>
119
+ <div class="progress">
120
+ <div class="progress-bar progress-bar-inner bg-primary" id="pronunciation-progress"
121
+ role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
122
+ </div>
123
+ </div>
124
+
125
+ <div class="mb-3">
126
+ <div class="score-label">
127
+ <span>Completeness Score</span>
128
+ <span id="completeness-score">0%</span>
129
+ </div>
130
+ <div class="progress">
131
+ <div class="progress-bar progress-bar-inner bg-warning" id="completeness-progress"
132
+ role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
133
+ </div>
134
+ </div>
135
+
136
+ <div class="mb-3">
137
+ <div class="score-label">
138
+ <span>Accuracy Score</span>
139
+ <span id="accuracy-score">0%</span>
140
+ </div>
141
+ <div class="progress">
142
+ <div class="progress-bar progress-bar-inner bg-danger" id="accuracy-progress" role="progressbar"
143
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
144
+ </div>
145
+ </div>
146
+
147
+ <div class="mb-3">
148
+ <div class="score-label">
149
+ <span>Grammar Score</span>
150
+ <span id="grammar-score">0%</span>
151
+ </div>
152
+ <div class="progress">
153
+ <div class="progress-bar progress-bar-inner bg-info" id="grammar-progress" role="progressbar"
154
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
155
+ </div>
156
+ </div>
157
+
158
+ <div class="mb-3">
159
+ <div class="score-label">
160
+ <span>Comprehension Score</span>
161
+ <span id="comprehension-score">0%</span>
162
+ </div>
163
+ <div class="progress">
164
+ <div class="progress-bar progress-bar-inner bg-dark" id="comprehension-progress" role="progressbar"
165
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
166
+ </div>
167
+ </div>
168
+
169
+ <div class="mb-3">
170
+ <div class="score-label">
171
+ <span>Intonation Score</span>
172
+ <span id="intonation-score">0%</span>
173
+ </div>
174
+ <div class="progress">
175
+ <div class="progress-bar progress-bar-inner bg-secondary" id="intonation-progress"
176
+ role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
177
+ </div>
178
+ </div>
179
+ </div>
180
+
181
+ <div class="feedback-section">
182
+ <h4>Speech Analysis Feedback</h4>
183
+ <p><strong>Identified Text:</strong> <span id="identified-text"></span></p>
184
+
185
+ <div id="feedback-container">
186
+ <!-- <p><strong>Pronunciation Feedback:</strong> <span id="pronunciation-feedback"></span></p> -->
187
+ <!-- <p><strong>Fluency Feedback:</strong> <span id="fluency-feedback"></span></p> -->
188
+ <!-- <p><strong>Accuracy Feedback:</strong> <span id="accuracy-feedback"></span></p> -->
189
+ <!-- <p><strong>Grammar Feedback:</strong> <span id="grammar-feedback"></span></p> -->
190
+ <!-- <p><strong>Intonation Feedback:</strong> <span id="intonation-feedback"></span></p>
191
+ <p><strong>Comprehension Feedback:</strong> <span id="comprehension-feedback"></span></p> -->
192
+ </div>
193
+ </div>
194
+ </div>
195
+
196
+ <!-- Bootstrap JS and Popper.js -->
197
+ <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.7/dist/umd/popper.min.js"></script>
198
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.min.js"></script>
199
+
200
+ <script>
201
function analyzeAudio() {
    // Read the selected file and language before building the request.
    const audioFile = document.getElementById("audio-file").files[0];
    const language = document.getElementById("language-select").value;

    if (!audioFile) {
        alert("Please upload an audio file.");
        return;
    }

    const formData = new FormData();
    formData.append("audio_file", audioFile);
    formData.append("language", language);

    // Relative URL so the page works wherever the API is hosted, not only
    // on a hardcoded http://127.0.0.1:7860 deployment.
    fetch("/api/v1/analyze", {
        method: "POST",
        body: formData
    })
        .then(response => response.json())
        .then(data => {
            updateProgress('fluency', data.fluency_score);
            updateProgress('pronunciation', data.pronunciation_score);
            updateProgress('completeness', data.completeness_score);
            updateProgress('accuracy', data.accuracy_score);
            updateProgress('grammar', data.grammar_score);
            updateProgress('comprehension', data.comprehension_score);
            updateProgress('intonation', data.intonation_score);

            // Show the playback control for the uploaded file.
            const audioPlayer = document.getElementById("audio-player");
            audioPlayer.src = URL.createObjectURL(audioFile);
            document.getElementById("audio-player-container").style.display = "block";

            // Wrap mispronounced words and grammar errors in highlight spans.
            function highlightErrors(text, mispronouncedWords, grammarErrors) {
                return text.split(" ").map((word, index) => {
                    const mispronounced = mispronouncedWords.find(mw => mw.word === word && mw.position_in_text === index);
                    if (mispronounced) {
                        return `<span class="highlight-mispronounced">${word}</span>`;
                    }
                    const grammarError = grammarErrors.find(ge => ge.word === word && ge.position_in_text === index);
                    if (grammarError) {
                        return `<span class="highlight-grammar">${word}</span>`;
                    }
                    return word; // unmodified when no errors
                }).join(" ");
            }

            const highlightedText = highlightErrors(data.display_text, data.mispronunced_words, data.grammar_errors);
            document.getElementById("identified-text").innerHTML = highlightedText;

            // The feedback <span>s are commented out in the markup; guard
            // each lookup so a missing element no longer throws a TypeError
            // and aborts the rest of the rendering.
            const setFeedback = (id, value) => {
                const el = document.getElementById(id);
                if (el) el.textContent = value;
            };
            setFeedback("pronunciation-feedback", data.pronunciation_feedback);
            setFeedback("fluency-feedback", data.fluency_feedback);
            setFeedback("accuracy-feedback", data.accuracy_feedback);
            setFeedback("grammar-feedback", data.grammar_feedback);
            setFeedback("intonation-feedback", data.intonation_feedback);
            setFeedback("comprehension-feedback", data.comprehension_feedback);
        })
        .catch(error => console.error('Error:', error));
}
275
+
276
function updateProgress(scoreType, score) {
    // Keep the bar width, its aria value, and the numeric label in sync.
    const bar = document.getElementById(`${scoreType}-progress`);
    const label = document.getElementById(`${scoreType}-score`);
    const pct = `${score}%`;
    bar.style.width = pct;
    bar.setAttribute('aria-valuenow', score);
    label.textContent = pct;
}
283
+ </script>
284
+
285
+ </body>
286
+
287
+ </html>
app/utils/ai_speech.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import os
4
+ import time
5
+
6
+ import azure.cognitiveservices.speech as speechsdk
7
+ from dotenv import load_dotenv
8
+ from pydub import AudioSegment
9
+
10
+ from app.config import settings
11
+
12
+
13
def text_details(response_json):
    """Extract mispronounced words and the display transcript from an Azure
    pronunciation-assessment JSON response.

    Args:
        response_json: parsed JSON dict from the Azure speech SDK result.

    Returns:
        dict with ``mispronunced_words`` (word/offset/position entries) and
        ``display_text``. Empty values are returned when the response has no
        NBest entry (previously this raised IndexError).
    """
    nbest_list = response_json.get("NBest") or []
    if not nbest_list:
        return {"mispronunced_words": [], "display_text": ""}

    nbest = nbest_list[0]  # use the top recognition hypothesis
    mispronunced_words = [
        {
            "word": word_info["Word"],
            "offset": word_info["Offset"],
            # enumerate gives the true position; list.index() returned the
            # first equal entry, which is wrong for repeated words.
            "position_in_text": position,
        }
        for position, word_info in enumerate(nbest.get("Words", []))
        if word_info["PronunciationAssessment"]["ErrorType"] == "Mispronunciation"
    ]

    return {"mispronunced_words": mispronunced_words, "display_text": nbest["Display"]}
32
+
33
+
34
def pronunciation_assessment(file_path, language):
    """Run Azure pronunciation assessment on an audio file.

    Args:
        file_path: path to a WAV (or MP3, converted in place) audio file.
        language: recognition language tag, e.g. ``en-GB``.

    Returns:
        dict of pronunciation/accuracy/fluency/completeness scores plus
        mispronounced words and the display transcript. On recognition
        failure a zeroed result is returned instead of the previous implicit
        ``None``, which crashed downstream aggregation.
    """
    if str(file_path).endswith("mp3"):
        # The Azure SDK needs WAV input; convert next to the original file.
        mp3_path = file_path
        file_path = file_path.replace(".mp3", ".wav")
        sound = AudioSegment.from_mp3(mp3_path)
        sound.export(file_path, format="wav")

    # Initialize speech config from the app settings
    speech_config = speechsdk.SpeechConfig(
        subscription=settings.AZURE_AI_SUBSCRIPTION_KEY, region=settings.AZURE_AI_REGION
    )
    speech_config.speech_recognition_language = language

    # Create pronunciation assessment config
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True,
    )

    # Initialize audio config from file and wire everything together
    audio_config = speechsdk.audio.AudioConfig(filename=file_path)
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    pronunciation_config.apply_to(recognizer)

    # Perform recognition and assessment (single utterance)
    result = recognizer.recognize_once()

    if result.reason != speechsdk.ResultReason.RecognizedSpeech:
        # Previously result.json was parsed before this check and the
        # function fell off the end returning None; return zeroed scores so
        # aggregate_scores still works.
        return {
            "File": file_path,
            "pronunciation_score": 0,
            "accuracy_score": 0,
            "fluency_score": 0,
            "completeness_score": 0,
            "mispronunced_words": [],
            "display_text": "",
        }

    audio_text_details = text_details(json.loads(result.json))
    pronunciation_result = speechsdk.PronunciationAssessmentResult(result)

    # Build result dictionary
    return {
        "File": file_path,
        "pronunciation_score": pronunciation_result.pronunciation_score,
        "accuracy_score": pronunciation_result.accuracy_score,
        "fluency_score": pronunciation_result.fluency_score,
        "completeness_score": pronunciation_result.completeness_score,
        "mispronunced_words": audio_text_details.get("mispronunced_words", []),
        "display_text": audio_text_details.get("display_text", ""),
    }
app/utils/common.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import spacy
4
+ import speech_recognition as sr
5
+ from loguru import logger
6
+ from pydub import AudioSegment
7
+
8
+
9
# Load spaCy's English language model for grammar and text analysis
def get_doc(text):
    """Parse *text* with spaCy's small English model and return the Doc.

    Raises:
        RuntimeError: if the model is not installed (the original code ended
            in a dangling ``os.`` statement — a SyntaxError — and would have
            silently returned None otherwise).
    """
    try:
        nlp = spacy.load("en_core_web_sm")
        return nlp(text)
    except OSError as ex:
        logger.exception(f"Error while getting Spacy doc: {ex}")
        logger.info("Please download the model with this command: python -m spacy download en_core_web_sm")
        raise RuntimeError("spaCy model 'en_core_web_sm' is not installed") from ex
19
+
20
+
21
# Convert MP3 to WAV using pydub
def convert_mp3_to_wav(mp3_file, wav_file="converted_comprehension_audio.wav"):
    """Decode *mp3_file* and export it as *wav_file*; return the WAV path."""
    AudioSegment.from_mp3(mp3_file).export(wav_file, format="wav")
    return wav_file
26
+
27
+
28
# Convert WAV to text using SpeechRecognition
def transcribe_audio(wav_file):
    """Transcribe *wav_file* with Google's free recognizer; return the text."""
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file) as source:
        audio_data = recognizer.record(source)
    return recognizer.recognize_google(audio_data)
35
+
36
+
37
def split_audio(audio_path, chunk_length_ms=30000):
    """Splits audio into chunks of specified length (in milliseconds).

    Each chunk is exported as ``<audio_path>_chunk_<idx>.wav``; the list of
    chunk paths is returned in order.
    """
    audio = AudioSegment.from_wav(audio_path)
    chunk_paths = []
    for idx, start in enumerate(range(0, len(audio), chunk_length_ms)):
        chunk_path = f"{audio_path}_chunk_{idx}.wav"
        audio[start : start + chunk_length_ms].export(chunk_path, format="wav")
        chunk_paths.append(chunk_path)
    return chunk_paths
49
+
50
+
51
def aggregate_scores(chunk_scores):
    """Aggregate per-chunk pronunciation scores into a single result.

    Numeric scores are averaged across chunks (rounded to 2 decimals),
    mispronounced-word lists are concatenated, and display texts are joined.

    Args:
        chunk_scores: list of per-chunk score dicts from
            ``pronunciation_assessment``.

    Returns:
        dict of aggregated scores; all zeros/empty when *chunk_scores* is
        empty (previously this raised ZeroDivisionError).
    """
    final_score = {
        "accuracy_score": 0,
        "fluency_score": 0,
        "completeness_score": 0,
        "pronunciation_score": 0,
        "mispronunced_words": [],
        "display_text": "",
    }
    num_chunks = len(chunk_scores)
    if num_chunks == 0:
        return final_score

    # Sum each score across chunks
    for score in chunk_scores:
        final_score["accuracy_score"] += score.get("accuracy_score", 0)
        final_score["fluency_score"] += score.get("fluency_score", 0)
        final_score["completeness_score"] += score.get("completeness_score", 0)
        final_score["pronunciation_score"] += score.get("pronunciation_score", 0)
        # `or []` / `or ""` guards: a chunk missing these keys previously
        # caused extend(None) / += None TypeErrors.
        final_score["mispronunced_words"].extend(score.get("mispronunced_words") or [])
        final_score["display_text"] += score.get("display_text") or ""

    # Average each numeric score
    for key, value in final_score.items():
        if isinstance(value, (int, float)):
            final_score[key] = round(value / num_chunks, 2)

    return final_score
78
+
79
+
80
def remove_files(file_list):
    """Delete every path in *file_list*, logging (not raising) failures.

    Uses per-file ``os.remove`` instead of the previous
    ``os.system("rm " + " ".join(file_list))``, which was shell-injectable,
    broke on filenames containing spaces, and was not portable to Windows.
    """
    for file_path in file_list:
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass  # already gone — keep the best-effort semantics
        except OSError as ex:
            logger.exception(f"Error removing files: {ex}")
    logger.info("All listed files removed.")
app/utils/comprehension.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
+ import spacy
4
+ import speech_recognition as sr
5
+ from pydub import AudioSegment
6
+
7
+
8
# Comprehension Score Evaluation (No Reference Text)
def evaluate_comprehension(text, doc):
    """Heuristically score comprehension of *doc* without a reference text.

    Combines three 0-10 sub-scores — sentence clarity, lexical diversity
    (topic relevance), and understanding — into an averaged final score.
    """
    # 1. Listening comprehension: shorter sentences read as clearer speech.
    sentences = list(doc.sents)
    avg_sentence_len = (sum(len(s) for s in sentences) / len(sentences)) if sentences else 0

    if avg_sentence_len <= 10:            # short, simple sentences
        listening_comprehension_score = 10
    elif avg_sentence_len <= 20:          # moderate length
        listening_comprehension_score = 8
    else:                                 # long sentences: less clarity
        listening_comprehension_score = 6

    # 2. Topic relevance: lexical diversity as a proxy for on-topic speech.
    words = [token.text.lower() for token in doc if token.is_alpha]
    if not words:
        topic_relevance_score = 0
    else:
        diversity = len(set(words)) / len(words)
        if diversity > 0.7:               # high diversity
            topic_relevance_score = 10
        elif diversity >= 0.4:            # moderate diversity
            topic_relevance_score = 7
        else:                             # repetitive, possibly off-topic
            topic_relevance_score = 4

    # 3. Understanding: reuse the clarity-based score.
    understanding_score = listening_comprehension_score

    # Final score: mean of the three sub-scores.
    comprehension_score = (listening_comprehension_score + topic_relevance_score + understanding_score) / 3

    return {
        "listening_comprehension": round(listening_comprehension_score, 2),
        "topic_relevance": round(topic_relevance_score, 2),
        "understanding": round(understanding_score, 2),
        "comprehension_score": round(comprehension_score, 2),
    }
app/utils/constants.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Application identity, consumed by the FastAPI app factory.
APP_NAME = "Speech Analysis"
APP_DESCRIPTION = "Speech Analysis ML Inference API"
APP_VERSION = "v1"

# Route prefixes.
API_BASE = "/"
API_PREFIX = "/api/" + APP_VERSION

# Project root: this file lives at <root>/app/utils/constants.py, so walk
# three levels up from the file's absolute path. This is more robust than
# splitting on the substring "/app", which breaks on Windows path separators
# or whenever an ancestor directory name happens to contain "app".
BASE_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
UPLOAD_DIR = os.path.join(BASE_PATH, "audio")  # incoming audio files
STATIC_DIR = os.path.join(BASE_PATH, "app", "static")  # served static assets

# Health-check endpoints.
API_READINESS = "/readiness"
API_LIVENESS = "/liveness"
app/utils/get_scores.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ from concurrent.futures import ThreadPoolExecutor
4
+
5
+ from loguru import logger
6
+
7
+ from app.utils.common import convert_mp3_to_wav, get_doc, transcribe_audio
8
+ from app.utils.comprehension import evaluate_comprehension
9
+ from app.utils.grammar import evaluate_grammar
10
+ from app.utils.intonation import evaluate_intonation
11
+
12
+
13
def get_content_score(audio_file):
    """Compute intonation, grammar, and comprehension scores for a recording.

    Parameters:
        audio_file: path to an ``.mp3`` or WAV audio file.

    Returns:
        dict with the three scores scaled from 0-10 to 0-100 (rounded to two
        decimals) plus the list of detected grammar errors.
    """
    # Step 1: Normalize to WAV. Match the ".mp3" extension case-insensitively
    # instead of the bare suffix "mp3", which would also match names like
    # "foomp3" and miss "FILE.MP3".
    if audio_file.lower().endswith(".mp3"):
        wav_file = convert_mp3_to_wav(audio_file)
    else:
        wav_file = audio_file

    # Step 2: Transcribe the WAV to text
    transcript = transcribe_audio(wav_file)

    # Step 3: Parse once and share the doc across the evaluators.
    doc = get_doc(transcript)

    # Step 4: The three evaluations are independent, so run them concurrently.
    with ThreadPoolExecutor() as executor:
        intonation_future = executor.submit(evaluate_intonation, wav_file)
        grammar_future = executor.submit(evaluate_grammar, transcript, doc)
        comprehension_future = executor.submit(evaluate_comprehension, transcript, doc)

    intonation = intonation_future.result()
    grammar = grammar_future.result()  # fetch once instead of once per key
    comprehension = comprehension_future.result()

    return {
        "intonation_score": round(float(intonation.get("intonation_score")) * 10, 2),
        "grammar_score": round(float(grammar.get("grammar_score")) * 10, 2),
        "grammar_errors": grammar.get("errors"),
        "comprehension_score": round(float(comprehension.get("comprehension_score")) * 10, 2),
    }
app/utils/grammar.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import speech_recognition as sr
3
+ from pydub import AudioSegment
4
+
5
+
6
# Enhanced Sentence Structure Scoring
def evaluate_sentence_structure(doc):
    """Score sentence structure on a 0-10 scale.

    Each sentence is rated on two axes -- token length and syntactic
    complexity (number of distinct dependency labels, punctuation ignored) --
    and the per-sentence averages are averaged over the whole document.

    Parameters:
        doc: parsed spaCy ``Doc``.

    Returns:
        float in [0, 10]; 0 when the document has no sentences.
    """
    all_sentences = list(doc.sents)
    if not all_sentences:
        return 0

    total = 0
    for sent in all_sentences:
        # Length axis: moderate-length sentences score best.
        n_tokens = len(sent)
        if 10 <= n_tokens <= 20:
            length_score = 10
        elif 20 < n_tokens <= 30:
            length_score = 8
        elif n_tokens > 30:
            length_score = 5
        else:
            length_score = 2

        # Complexity axis: distinct dependency labels, excluding punctuation.
        dep_labels = {tok.dep_ for tok in sent if tok.dep_ != "punct"}
        if len(dep_labels) > 6:  # complex sentence
            complexity_score = 10
        elif len(dep_labels) >= 3:  # moderately complex
            complexity_score = 7
        else:  # simple sentence
            complexity_score = 4

        # Each sentence contributes the mean of its two axis scores.
        total += (length_score + complexity_score) / 2

    # Average over sentences, capped at 10.
    return min(total / len(all_sentences), 10)
39
+
40
+
41
# Grammar scoring based on Sentence Structure, Grammar Usage, and Vocabulary Range
def evaluate_grammar(text, doc):
    """Score grammar on a 0-10 scale.

    Combines three equally weighted sub-scores:
      1. sentence structure (delegated to ``evaluate_sentence_structure``),
      2. grammar usage (starts at 10, deducts per detected error),
      3. vocabulary range (lexical diversity).

    Parameters:
        text: the raw transcript, echoed back in the result.
        doc: parsed spaCy ``Doc`` for the transcript.

    Returns:
        dict with the sub-scores, the final ``grammar_score``, the list of
        detected ``errors``, and the original ``text``.
    """
    # 1. Sentence Structure
    sentence_structure_score = evaluate_sentence_structure(doc)
    errors = []

    # 2. Grammar Usage: assume perfect grammar, deduct for each detected issue.
    # Example:
    # - The boy hungry. # Wrong
    # - The boy is hungry # Correct
    grammar_usage_score = 10
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ != "VERB":
            errors.append(
                {
                    "word": token.text,
                    "position_in_text": token.i,
                    "error": "Subject without a verb",
                    "suggestion": "Ensure the subject is followed by a verb.",
                }
            )
            grammar_usage_score -= 2  # Deduct points for common grammar mistakes
    # Clamp so six or more errors cannot push the sub-score below zero,
    # which would otherwise drag the final average negative.
    grammar_usage_score = max(grammar_usage_score, 0)

    # 3. Vocabulary Range: lexical diversity, guarded against an empty doc
    # (previously a ZeroDivisionError when the transcript had no tokens).
    unique_words = {token.text.lower() for token in doc if token.is_alpha}
    vocabulary_range_score = min(len(unique_words) / len(doc), 1) * 10 if len(doc) else 0

    # Final Grammar Score: (Sentence Structure + Grammar Usage + Vocabulary Range) / 3
    grammar_score = (sentence_structure_score + grammar_usage_score + vocabulary_range_score) / 3

    return {
        "sentence_structure": round(sentence_structure_score, 2),
        "grammar_usage": round(grammar_usage_score, 2),
        "vocabulary_range": round(vocabulary_range_score, 2),
        "grammar_score": round(grammar_score, 2),
        "errors": errors,
        "text": text,
    }
app/utils/intonation.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ from pydub import AudioSegment
4
+
5
+
6
# Feature extraction for Intonation
def evaluate_intonation(wav_file):
    """Score intonation of a WAV recording on a 0-10 scale.

    Combines three equally weighted, clamped sub-scores derived from the
    signal: sentence stress (energy), intonation patterns (pitch variation),
    and rhythm (tempo).

    Parameters:
        wav_file: path to a WAV audio file.

    Returns:
        dict with the three sub-scores and the averaged ``intonation_score``.
    """
    # Load audio using librosa (resampled mono at its default rate).
    y, sr = librosa.load(wav_file)

    # 1. Sentence Stress (based on energy)
    # RMS energy represents perceived loudness, our proxy for sentence stress.
    rms_energy = librosa.feature.rms(y=y)[0]
    avg_energy = float(np.mean(rms_energy) * 10)  # scale up for score calculation

    # 2. Intonation Patterns (based on pitch variation)
    # piptrack estimates pitch over time; keep bins with above-median magnitude.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    mean_pitch = float(np.mean(pitch_values)) if len(pitch_values) > 0 else 0.0
    if mean_pitch > 0:
        # Coefficient of variation of pitch, scaled toward a 0-10 range.
        pitch_variation = float(np.std(pitch_values) / mean_pitch * 10)
    else:
        # No voiced frames, or all selected pitch bins are zero: avoid the
        # division by zero the unguarded ratio would hit.
        pitch_variation = 0.0

    # 3. Rhythm (based on tempo)
    # beat_track estimates tempo in BPM; newer librosa versions return it as
    # an ndarray, so coerce to a plain Python float either way.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo = float(np.atleast_1d(tempo)[0])
    rhythm_score = (tempo / 120) * 10  # 120 BPM ~ average speaking tempo

    # Clamp every sub-score into [0, 10] before averaging.
    sentence_stress_score = min(max(avg_energy, 0), 10)
    intonation_patterns_score = min(max(pitch_variation, 0), 10)
    rhythm_score = min(max(rhythm_score, 0), 10)

    # Final score: (Sentence Stress + Intonation Patterns + Rhythm) / 3
    intonation_score = (sentence_stress_score + intonation_patterns_score + rhythm_score) / 3

    return {
        "sentence_stress": sentence_stress_score,
        "intonation_patterns": intonation_patterns_score,
        "rhythm": rhythm_score,
        "intonation_score": intonation_score,
    }
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Local development stack for the Speech Analysis API.
version: "3.8"

services:
  web:
    build: .
    # NOTE(review): runs a single uvicorn worker here, while the Dockerfile
    # CMD uses --workers 4 — confirm which is intended for each environment.
    command: "uvicorn app.main:server --host 0.0.0.0 --port 7860"
    volumes:
      # Bind-mount the source tree so local edits are visible without a rebuild.
      - .:/app
    ports:
      - "7860:7860"
    env_file:
      - .env
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Web framework and ASGI server
fastapi
uvicorn
# Audio conversion (requires ffmpeg available at runtime)
pydub
# NLP parsing (en_core_web_sm model is installed in the Dockerfile)
spacy
# Speech-to-text; the only pinned dependency — TODO(review): consider pinning the rest for reproducible builds
SpeechRecognition==3.10.4
# Logging
loguru
# Audio feature extraction
numpy
librosa
# Azure speech service client
azure-cognitiveservices-speech
# Settings and .env loading
python-dotenv
pydantic-settings
# Multipart form parsing for FastAPI file uploads
python-multipart