jaykishan-b commited on
Commit
79b7942
·
1 Parent(s): ddddf1b
.gitignore ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Compiled Java class files
2
+ *.class
3
+
4
+ # Compiled Python bytecode
5
+ *.py[cod]
6
+
7
+ # Log files
8
+ *.log
9
+
10
+ # Package files
11
+ *.jar
12
+
13
+ # Maven
14
+ target/
15
+ dist/
16
+
17
+ # JetBrains IDE
18
+ .idea/
19
+
20
+ # Unit test reports
21
+ TEST*.xml
22
+
23
+ # Generated by MacOS
24
+ .DS_Store
25
+
26
+ # Generated by Windows
27
+ Thumbs.db
28
+
29
+ # Applications
30
+ *.app
31
+ *.exe
32
+ *.war
33
+
34
+ # Large media files
35
+ *.mp4
36
+ *.tiff
37
+ *.avi
38
+ *.flv
39
+ *.mov
40
+ *.wmv
41
+
42
+ # Byte-compiled / optimized / DLL files
43
+ __pycache__/
44
+ */__pycache__/
45
+ */*/__pycache__/
46
+ */*/*/__pycache__/
47
+ *.py[cod]
48
+ */*.py[cod]
49
+ */*/*.py[cod]
50
+ */*/*/*.py[cod]
51
+
52
+ # VSCode IDE
53
+ .vscode/
54
+
55
+ # Environment
56
+ env/
57
+ venv/
58
+ .env
59
+ .venv
60
+
61
+ audio/
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use an official Python runtime as a parent image
2
+ FROM python:3.9-slim
3
+
4
+ # Set the working directory
5
+ WORKDIR /app
6
+
7
+ # Install dependencies
8
+ COPY requirements.txt .
9
+ RUN pip install --no-cache-dir -r requirements.txt
10
+
11
+ # Install spaCy model
12
+ RUN python -m spacy download en_core_web_sm
13
+
14
+ # Copy the current directory contents into the container
15
+ COPY . .
16
+
17
+ # Expose the port FastAPI will run on
18
+ EXPOSE 7860
19
+
20
+ # Command to run the application with Gunicorn and Uvicorn workers
21
+ CMD ["uvicorn", "app.main:server", "--host", "0.0.0.0", "--port", "7860", "--workers", "4"]
__init__.py ADDED
File without changes
app/__init__.py ADDED
File without changes
app/config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from pydantic_settings import BaseSettings
4
+
5
+
6
class Settings(BaseSettings):
    """Application settings, loaded from the environment or a `.env` file.

    The previous `os.getenv(...)` defaults were redundant (pydantic-settings
    already reads environment variables) and harmful: they were evaluated at
    class-definition time, before the `.env` file is loaded, so a variable
    present only in `.env` produced a `None` default that failed validation
    with a confusing "none is not a str" error. Declaring the fields without
    defaults makes them properly required.
    """

    # Azure Cognitive Services speech credentials — required at startup.
    AZURE_AI_SUBSCRIPTION_KEY: str
    AZURE_AI_REGION: str

    class Config:
        # Fallback source when the variables are not in the process env.
        env_file = ".env"


settings = Settings()
app/main.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import uvicorn
4
+ from fastapi import FastAPI
5
+ from fastapi.responses import HTMLResponse
6
+ from fastapi.staticfiles import StaticFiles
7
+
8
+ from app.config import settings
9
+ from app.manage import configure_app
10
+ from app.utils.constants import APP_DESCRIPTION, APP_NAME, APP_VERSION, DOCS_URL, OPENAPI_URL, STATIC_DIR
11
+
12
+
13
def initialize_app():
    """Initializes the app by adding required configurations using configure_app

    Returns:
        app: returns configured app's FastAPI instance.
    """

    fastapi_app = FastAPI(title=APP_NAME, version=APP_VERSION, description=APP_DESCRIPTION)

    # app configuration - one time (routers, logging middleware, CORS)
    configure_app(fastapi_app)

    # Mount the static folder
    fastapi_app.mount(
        "/static",
        StaticFiles(directory=STATIC_DIR),
        name="static",
    )

    @fastapi_app.get("/")
    async def read_index():
        # Serve the single-page UI. Explicit encoding avoids decode errors on
        # platforms whose default locale encoding is not UTF-8.
        with open(os.path.join(STATIC_DIR, "index.html"), encoding="utf-8") as f:
            return HTMLResponse(content=f.read(), status_code=200)

    return fastapi_app
38
+
39
+
40
# initialize app at the end
server = initialize_app()


def start():
    """Run the app with uvicorn for local development (the Dockerfile CMD
    runs the same `app.main:server` on port 7860 instead)."""
    uvicorn.run(server, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    start()
app/manage.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi.middleware.cors import CORSMiddleware
2
+
3
+ from app.middlewares.logging import RequestLoggingMiddleware
4
+ from app.services.speech_analysis import speech_analysis_router
5
+ from app.utils.constants import API_PREFIX
6
+
7
+ __all__ = ["configure_app"]
8
+
9
+
10
def configure_app(app):
    """Configure and return Fast Application by adding routers and middlewares to the app"""
    # Configure router: all speech-analysis endpoints live under API_PREFIX.
    app.include_router(speech_analysis_router, prefix=API_PREFIX)

    # configure middlewares — request-id logging wraps every request
    app.add_middleware(RequestLoggingMiddleware)

    # CORS settings to allow frontend requests
    # NOTE(review): allow_origins=["*"] combined with allow_credentials=True
    # is rejected by browsers for credentialed requests (wildcard origin is
    # not allowed with credentials) — confirm whether credentials are needed.
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    return app
app/middlewares/logging.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import uuid
3
+ import time
4
+
5
+ import loguru
6
+ from starlette.middleware.base import BaseHTTPMiddleware
7
+
8
# Module-wide loguru logger: remove the default sink and install a stdout
# sink whose format includes the per-request id bound via
# logger.contextualize(request_id=...) in the middleware below.
logger = loguru.logger
logger.remove()
logger.add(
    sys.stdout,
    format="{time} - {level} - {file.path} - {function} - {line} - ({extra[request_id]}) :: {message}",
    level="DEBUG",
    backtrace=False,
)
16
+
17
+
18
class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Middleware for logging incoming HTTP requests and their responses.

    Attributes:
        app: The ASGI application to which the middleware is applied.

    Methods:
        dispatch: Intercepts each HTTP request, tags its log records with a
            unique request id, logs start/end, and logs any exception raised
            while processing the request.
    """

    def __init__(self, app):
        super().__init__(app)

    async def dispatch(self, request, call_next):
        # Unique id so all log lines of one request can be correlated.
        request_id = str(uuid.uuid4())
        with logger.contextualize(request_id=request_id):
            logger.info("Request started")
            try:
                return await call_next(request)
            except Exception as ex:
                logger.exception(f"Request failed: {ex}")
                # Re-raise so the server produces a proper 500 response.
                # Swallowing the exception made dispatch return None, which
                # crashes starlette later with an unrelated TypeError.
                raise
            finally:
                logger.info("Request ended")
app/models/speech_analysis.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel
2
+
3
+
4
class SpeechAnalysisResponse(BaseModel):
    """Response schema for the /analyze endpoint."""

    # 0-100 scores from Azure pronunciation assessment (averaged per chunk)
    fluency_score: float
    pronunciation_score: float
    completeness_score: float
    accuracy_score: float
    # 0-100 scores derived from transcript analysis heuristics
    grammar_score: float
    comprehension_score: float
    intonation_score: float
    # Free-text feedback (currently placeholder "Demo Content" in the API)
    pronunciation_feedback: str
    fluency_feedback: str
    accuracy_feedback: str
    grammar_feedback: str
    intonation_feedback: str
    comprehension_feedback: str
    # Detected grammar errors with word positions and suggestions
    grammar_errors: list
    # NOTE(review): name is misspelled ("mispronunced") but it is part of the
    # public API contract — renaming would break existing clients.
    mispronunced_words: list
    # Full recognized transcript returned by the speech service
    display_text: str
app/services/healthcheck.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Response
2
+
3
+ from app.utils.constants import API_READINESS, API_LIVENESS
4
+
5
# Router exposing kubernetes-style health probes (hidden from the schema).
router = APIRouter()


@router.get(API_READINESS, include_in_schema=False)
async def readiness_controller(response: Response):
    # Readiness probe: respond 200 with an empty body.
    response.status_code = 200


@router.get(API_LIVENESS, include_in_schema=False)
async def liveness_controller(response: Response):
    # Liveness probe: respond 200 with an empty body.
    response.status_code = 200
app/services/speech_analysis.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import time
4
+ from concurrent.futures import ThreadPoolExecutor
5
+
6
+ from fastapi import APIRouter, FastAPI, File, Form, UploadFile
7
+ from fastapi.responses import HTMLResponse
8
+ from pydub import AudioSegment
9
+
10
+ from app.utils.ai_speech import pronunciation_assessment
11
+ from app.utils.common import aggregate_scores, remove_files, split_audio
12
+ from app.utils.get_scores import get_content_score
13
+ from app.models.speech_analysis import SpeechAnalysisResponse
14
+ from app.utils.constants import UPLOAD_DIR
15
+ from loguru import logger
16
+
17
+ speech_analysis_router = APIRouter()
18
+
19
+
20
@speech_analysis_router.post("/analyze", response_model=SpeechAnalysisResponse)
async def analyze_speech(audio_file: UploadFile = File(...), language: str = Form(...)):
    """Analyze an uploaded speech recording and return combined scores.

    Saves the upload, converts MP3 to WAV if needed, splits recordings longer
    than 30 s into chunks, scores pronunciation per chunk via Azure in
    parallel with transcript-based grammar/intonation/comprehension scoring,
    then aggregates everything into one response.

    Args:
        audio_file: uploaded audio file (WAV or MP3).
        language: recognition language tag, e.g. ``en-GB``.

    Returns:
        dict matching the SpeechAnalysisResponse schema.
    """
    # Make audio directory if not available
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    s_time = time.time()
    # basename() guards against path-traversal characters in the client-
    # supplied filename.
    audio_file_path = os.path.join(UPLOAD_DIR, os.path.basename(audio_file.filename))

    with open(audio_file_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    if audio_file_path.endswith("mp3"):
        # Local import: the original code called convert_mp3_to_wav without
        # importing it, raising NameError for every MP3 upload.
        from app.utils.common import convert_mp3_to_wav

        audio_file_path = convert_mp3_to_wav(audio_file_path, audio_file_path.replace("mp3", "wav"))

    # Split audio if longer than 30 seconds
    audio_length_ms = AudioSegment.from_wav(audio_file_path).duration_seconds * 1000
    if audio_length_ms > 30000:
        chunk_paths = split_audio(audio_file_path)
    else:
        chunk_paths = [audio_file_path]  # No splitting needed

    # Pronunciation (per chunk) and content scoring are independent, so run
    # them concurrently.
    with ThreadPoolExecutor() as executor:
        content_future = executor.submit(get_content_score, audio_file_path)
        pronunciation_futures = [executor.submit(pronunciation_assessment, path, language) for path in chunk_paths]
        content_score = content_future.result()
        chunk_scores = [future.result() for future in pronunciation_futures]

    # Aggregate azure_speech_scores across chunks
    final_azure_speech_score = aggregate_scores(chunk_scores)

    # Combine final results
    final_azure_speech_score.update(
        {
            "grammar_score": content_score.get("grammar_score"),
            "intonation_score": content_score.get("intonation_score"),
            "comprehension_score": content_score.get("comprehension_score"),
            "grammar_errors": content_score.get("grammar_errors"),
            # TODO: replace placeholder feedback with generated text
            "pronunciation_feedback": "Demo Content",
            "fluency_feedback": "Demo Content",
            "accuracy_feedback": "Demo Content",
            "grammar_feedback": "Demo Content",
            "intonation_feedback": "Demo Content",
            "comprehension_feedback": "Demo Content",
        }
    )

    logger.info(str(time.time() - s_time))

    # De-duplicate before cleanup: when no split happened, audio_file_path is
    # already in chunk_paths and would otherwise be deleted twice.
    remove_files(list(dict.fromkeys(chunk_paths + [audio_file_path])))

    return final_azure_speech_score
app/static/index.html ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
7
+ <title>Speech Analysis</title>
8
+ <!-- Bootstrap CSS -->
9
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet">
10
+ <style>
11
+ body {
12
+ background-color: #f4f7f8;
13
+ font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
14
+ }
15
+
16
+ .container {
17
+ margin-top: 50px;
18
+ padding: 20px;
19
+ background: #fff;
20
+ box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
21
+ border-radius: 10px;
22
+ }
23
+
24
+ .progress-bar-inner {
25
+ width: 0%;
26
+ transition: width 1s ease-in-out;
27
+ }
28
+
29
+ .audio-player {
30
+ margin-top: 20px;
31
+ }
32
+
33
+ .btn-analyze {
34
+ background-color: #007bff;
35
+ color: white;
36
+ border-radius: 5px;
37
+ padding: 10px 20px;
38
+ }
39
+
40
+ .btn-analyze:hover {
41
+ background-color: #0056b3;
42
+ }
43
+
44
+ .score-label {
45
+ display: flex;
46
+ justify-content: space-between;
47
+ font-weight: 600;
48
+ }
49
+
50
+ .feedback-section {
51
+ margin-top: 30px;
52
+ }
53
+
54
+ .highlight-mispronounced {
55
+ background-color: yellow;
56
+ font-weight: bold;
57
+ }
58
+
59
+ .highlight {
60
+ background-color: yellow;
61
+ font-weight: bold;
62
+ }
63
+
64
+ .highlight-grammar {
65
+ background-color: lightpink;
66
+ /* or any other color */
67
+ font-weight: bold;
68
+ }
69
+ </style>
70
+ </head>
71
+
72
+ <body>
73
+
74
+ <div class="container">
75
+ <h1 class="text-center mb-4">Speech Analysis</h1>
76
+
77
+ <div class="mb-3">
78
+ <label for="audio-file" class="form-label">Upload your audio file:</label>
79
+ <input type="file" class="form-control" id="audio-file" accept="audio/*">
80
+ </div>
81
+
82
+ <!-- Language Dropdown -->
83
+ <div class="mb-3">
84
+ <label for="language-select" class="form-label">Select Language:</label>
85
+ <select class="form-select" id="language-select">
86
+ <option value="en-GB">English (United Kingdom)</option>
87
+ <option value="nb-NO">Norwegian</option>
88
+ </select>
89
+ </div>
90
+
91
+ <div class="text-center mb-4">
92
+ <button class="btn btn-analyze" onclick="analyzeAudio()">Analyze Speech</button>
93
+ </div>
94
+
95
+ <div class="audio-player text-center" id="audio-player-container" style="display: none;">
96
+ <audio id="audio-player" controls></audio>
97
+ </div>
98
+
99
+ <h3 class="mt-5">Speech Scores</h3>
100
+
101
+ <!-- Scores with Progress Bars -->
102
+ <div id="scores-container">
103
+ <div class="mb-3">
104
+ <div class="score-label">
105
+ <span>Fluency Score</span>
106
+ <span id="fluency-score">0%</span>
107
+ </div>
108
+ <div class="progress">
109
+ <div class="progress-bar progress-bar-inner bg-success" id="fluency-progress" role="progressbar"
110
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
111
+ </div>
112
+ </div>
113
+
114
+ <div class="mb-3">
115
+ <div class="score-label">
116
+ <span>Pronunciation Score</span>
117
+ <span id="pronunciation-score">0%</span>
118
+ </div>
119
+ <div class="progress">
120
+ <div class="progress-bar progress-bar-inner bg-primary" id="pronunciation-progress"
121
+ role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
122
+ </div>
123
+ </div>
124
+
125
+ <div class="mb-3">
126
+ <div class="score-label">
127
+ <span>Completeness Score</span>
128
+ <span id="completeness-score">0%</span>
129
+ </div>
130
+ <div class="progress">
131
+ <div class="progress-bar progress-bar-inner bg-warning" id="completeness-progress"
132
+ role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
133
+ </div>
134
+ </div>
135
+
136
+ <div class="mb-3">
137
+ <div class="score-label">
138
+ <span>Accuracy Score</span>
139
+ <span id="accuracy-score">0%</span>
140
+ </div>
141
+ <div class="progress">
142
+ <div class="progress-bar progress-bar-inner bg-danger" id="accuracy-progress" role="progressbar"
143
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
144
+ </div>
145
+ </div>
146
+
147
+ <div class="mb-3">
148
+ <div class="score-label">
149
+ <span>Grammar Score</span>
150
+ <span id="grammar-score">0%</span>
151
+ </div>
152
+ <div class="progress">
153
+ <div class="progress-bar progress-bar-inner bg-info" id="grammar-progress" role="progressbar"
154
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
155
+ </div>
156
+ </div>
157
+
158
+ <div class="mb-3">
159
+ <div class="score-label">
160
+ <span>Comprehension Score</span>
161
+ <span id="comprehension-score">0%</span>
162
+ </div>
163
+ <div class="progress">
164
+ <div class="progress-bar progress-bar-inner bg-dark" id="comprehension-progress" role="progressbar"
165
+ aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
166
+ </div>
167
+ </div>
168
+
169
+ <div class="mb-3">
170
+ <div class="score-label">
171
+ <span>Intonation Score</span>
172
+ <span id="intonation-score">0%</span>
173
+ </div>
174
+ <div class="progress">
175
+ <div class="progress-bar progress-bar-inner bg-secondary" id="intonation-progress"
176
+ role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
177
+ </div>
178
+ </div>
179
+ </div>
180
+
181
+ <div class="feedback-section">
182
+ <h4>Speech Analysis Feedback</h4>
183
+ <p><strong>Identified Text:</strong> <span id="identified-text"></span></p>
184
+
185
+ <div id="feedback-container">
186
+ <!-- <p><strong>Pronunciation Feedback:</strong> <span id="pronunciation-feedback"></span></p> -->
187
+ <!-- <p><strong>Fluency Feedback:</strong> <span id="fluency-feedback"></span></p> -->
188
+ <!-- <p><strong>Accuracy Feedback:</strong> <span id="accuracy-feedback"></span></p> -->
189
+ <!-- <p><strong>Grammar Feedback:</strong> <span id="grammar-feedback"></span></p> -->
190
+ <!-- <p><strong>Intonation Feedback:</strong> <span id="intonation-feedback"></span></p>
191
+ <p><strong>Comprehension Feedback:</strong> <span id="comprehension-feedback"></span></p> -->
192
+ </div>
193
+ </div>
194
+ </div>
195
+
196
+ <!-- Bootstrap JS and Popper.js -->
197
+ <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.7/dist/umd/popper.min.js"></script>
198
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.min.js"></script>
199
+
200
+ <script>
201
function analyzeAudio() {
    // Read the selected file and language before building the request.
    const audioFile = document.getElementById("audio-file").files[0];
    const language = document.getElementById("language-select").value;

    if (!audioFile) {
        alert("Please upload an audio file.");
        return;
    }

    const formData = new FormData();
    formData.append("audio_file", audioFile);
    formData.append("language", language);

    // Relative URL so the page works wherever the API is hosted, not only
    // on a hardcoded http://127.0.0.1:7860 deployment.
    fetch("/api/v1/analyze", {
        method: "POST",
        body: formData
    })
        .then(response => response.json())
        .then(data => {
            updateProgress('fluency', data.fluency_score);
            updateProgress('pronunciation', data.pronunciation_score);
            updateProgress('completeness', data.completeness_score);
            updateProgress('accuracy', data.accuracy_score);
            updateProgress('grammar', data.grammar_score);
            updateProgress('comprehension', data.comprehension_score);
            updateProgress('intonation', data.intonation_score);

            // Show the playback control for the uploaded file.
            const audioPlayer = document.getElementById("audio-player");
            audioPlayer.src = URL.createObjectURL(audioFile);
            document.getElementById("audio-player-container").style.display = "block";

            // Wrap mispronounced words and grammar errors in highlight spans.
            function highlightErrors(text, mispronouncedWords, grammarErrors) {
                return text.split(" ").map((word, index) => {
                    const mispronounced = mispronouncedWords.find(mw => mw.word === word && mw.position_in_text === index);
                    if (mispronounced) {
                        return `<span class="highlight-mispronounced">${word}</span>`;
                    }
                    const grammarError = grammarErrors.find(ge => ge.word === word && ge.position_in_text === index);
                    if (grammarError) {
                        return `<span class="highlight-grammar">${word}</span>`;
                    }
                    return word; // unmodified when no errors
                }).join(" ");
            }

            const highlightedText = highlightErrors(data.display_text, data.mispronunced_words, data.grammar_errors);
            document.getElementById("identified-text").innerHTML = highlightedText;

            // The feedback <span>s are commented out in the markup; guard
            // each lookup so a missing element no longer throws a TypeError
            // and aborts the rest of the rendering.
            const setFeedback = (id, value) => {
                const el = document.getElementById(id);
                if (el) el.textContent = value;
            };
            setFeedback("pronunciation-feedback", data.pronunciation_feedback);
            setFeedback("fluency-feedback", data.fluency_feedback);
            setFeedback("accuracy-feedback", data.accuracy_feedback);
            setFeedback("grammar-feedback", data.grammar_feedback);
            setFeedback("intonation-feedback", data.intonation_feedback);
            setFeedback("comprehension-feedback", data.comprehension_feedback);
        })
        .catch(error => console.error('Error:', error));
}
275
+
276
function updateProgress(scoreType, score) {
    // Keep the bar width, its aria value, and the numeric label in sync.
    const bar = document.getElementById(`${scoreType}-progress`);
    const label = document.getElementById(`${scoreType}-score`);
    const pct = `${score}%`;
    bar.style.width = pct;
    bar.setAttribute('aria-valuenow', score);
    label.textContent = pct;
}
283
+ </script>
284
+
285
+ </body>
286
+
287
+ </html>
app/utils/ai_speech.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import os
4
+ import time
5
+
6
+ import azure.cognitiveservices.speech as speechsdk
7
+ from dotenv import load_dotenv
8
+ from pydub import AudioSegment
9
+
10
+ from app.config import settings
11
+
12
+
13
def text_details(response_json):
    """Extract mispronounced words and the display transcript from an Azure
    pronunciation-assessment JSON response.

    Args:
        response_json: parsed JSON dict from the Azure speech SDK result.

    Returns:
        dict with ``mispronunced_words`` (word/offset/position entries) and
        ``display_text``. Empty values are returned when the response has no
        NBest entry (previously this raised IndexError).
    """
    nbest_list = response_json.get("NBest") or []
    if not nbest_list:
        return {"mispronunced_words": [], "display_text": ""}

    nbest = nbest_list[0]  # use the top recognition hypothesis
    mispronunced_words = [
        {
            "word": word_info["Word"],
            "offset": word_info["Offset"],
            # enumerate gives the true position; list.index() returned the
            # first equal entry, which is wrong for repeated words.
            "position_in_text": position,
        }
        for position, word_info in enumerate(nbest.get("Words", []))
        if word_info["PronunciationAssessment"]["ErrorType"] == "Mispronunciation"
    ]

    return {"mispronunced_words": mispronunced_words, "display_text": nbest["Display"]}
32
+
33
+
34
def pronunciation_assessment(file_path, language):
    """Run Azure pronunciation assessment on an audio file.

    Args:
        file_path: path to a WAV (or MP3, converted in place) audio file.
        language: recognition language tag, e.g. ``en-GB``.

    Returns:
        dict of pronunciation/accuracy/fluency/completeness scores plus
        mispronounced words and the display transcript. On recognition
        failure a zeroed result is returned instead of the previous implicit
        ``None``, which crashed downstream aggregation.
    """
    if str(file_path).endswith("mp3"):
        # The Azure SDK needs WAV input; convert next to the original file.
        mp3_path = file_path
        file_path = file_path.replace(".mp3", ".wav")
        sound = AudioSegment.from_mp3(mp3_path)
        sound.export(file_path, format="wav")

    # Initialize speech config from the app settings
    speech_config = speechsdk.SpeechConfig(
        subscription=settings.AZURE_AI_SUBSCRIPTION_KEY, region=settings.AZURE_AI_REGION
    )
    speech_config.speech_recognition_language = language

    # Create pronunciation assessment config
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True,
    )

    # Initialize audio config from file and wire everything together
    audio_config = speechsdk.audio.AudioConfig(filename=file_path)
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    pronunciation_config.apply_to(recognizer)

    # Perform recognition and assessment (single utterance)
    result = recognizer.recognize_once()

    if result.reason != speechsdk.ResultReason.RecognizedSpeech:
        # Previously result.json was parsed before this check and the
        # function fell off the end returning None; return zeroed scores so
        # aggregate_scores still works.
        return {
            "File": file_path,
            "pronunciation_score": 0,
            "accuracy_score": 0,
            "fluency_score": 0,
            "completeness_score": 0,
            "mispronunced_words": [],
            "display_text": "",
        }

    audio_text_details = text_details(json.loads(result.json))
    pronunciation_result = speechsdk.PronunciationAssessmentResult(result)

    # Build result dictionary
    return {
        "File": file_path,
        "pronunciation_score": pronunciation_result.pronunciation_score,
        "accuracy_score": pronunciation_result.accuracy_score,
        "fluency_score": pronunciation_result.fluency_score,
        "completeness_score": pronunciation_result.completeness_score,
        "mispronunced_words": audio_text_details.get("mispronunced_words", []),
        "display_text": audio_text_details.get("display_text", ""),
    }
app/utils/common.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import spacy
4
+ import speech_recognition as sr
5
+ from loguru import logger
6
+ from pydub import AudioSegment
7
+
8
+
9
# Load spaCy's English language model for grammar and text analysis
def get_doc(text):
    """Parse *text* with spaCy's small English model and return the Doc.

    Raises:
        RuntimeError: if the model is not installed (the original code ended
            in a dangling ``os.`` statement — a SyntaxError — and would have
            silently returned None otherwise).
    """
    try:
        nlp = spacy.load("en_core_web_sm")
        return nlp(text)
    except OSError as ex:
        logger.exception(f"Error while getting Spacy doc: {ex}")
        logger.info("Please download the model with this command: python -m spacy download en_core_web_sm")
        raise RuntimeError("spaCy model 'en_core_web_sm' is not installed") from ex
19
+
20
+
21
# Convert MP3 to WAV using pydub
def convert_mp3_to_wav(mp3_file, wav_file="converted_comprehension_audio.wav"):
    """Decode *mp3_file* and export it as *wav_file*; return the WAV path."""
    AudioSegment.from_mp3(mp3_file).export(wav_file, format="wav")
    return wav_file
26
+
27
+
28
# Convert WAV to text using SpeechRecognition
def transcribe_audio(wav_file):
    """Transcribe *wav_file* with Google's free recognizer; return the text."""
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file) as source:
        audio_data = recognizer.record(source)
    return recognizer.recognize_google(audio_data)
35
+
36
+
37
def split_audio(audio_path, chunk_length_ms=30000):
    """Splits audio into chunks of specified length (in milliseconds).

    Each chunk is exported as ``<audio_path>_chunk_<idx>.wav``; the list of
    chunk paths is returned in order.
    """
    audio = AudioSegment.from_wav(audio_path)
    chunk_paths = []
    for idx, start in enumerate(range(0, len(audio), chunk_length_ms)):
        chunk_path = f"{audio_path}_chunk_{idx}.wav"
        audio[start : start + chunk_length_ms].export(chunk_path, format="wav")
        chunk_paths.append(chunk_path)
    return chunk_paths
49
+
50
+
51
def aggregate_scores(chunk_scores):
    """Aggregate per-chunk pronunciation scores into a single result.

    Numeric scores are averaged across chunks (rounded to 2 decimals),
    mispronounced-word lists are concatenated, and display texts are joined.

    Args:
        chunk_scores: list of per-chunk score dicts from
            ``pronunciation_assessment``.

    Returns:
        dict of aggregated scores; all zeros/empty when *chunk_scores* is
        empty (previously this raised ZeroDivisionError).
    """
    final_score = {
        "accuracy_score": 0,
        "fluency_score": 0,
        "completeness_score": 0,
        "pronunciation_score": 0,
        "mispronunced_words": [],
        "display_text": "",
    }
    num_chunks = len(chunk_scores)
    if num_chunks == 0:
        return final_score

    # Sum each score across chunks
    for score in chunk_scores:
        final_score["accuracy_score"] += score.get("accuracy_score", 0)
        final_score["fluency_score"] += score.get("fluency_score", 0)
        final_score["completeness_score"] += score.get("completeness_score", 0)
        final_score["pronunciation_score"] += score.get("pronunciation_score", 0)
        # `or []` / `or ""` guards: a chunk missing these keys previously
        # caused extend(None) / += None TypeErrors.
        final_score["mispronunced_words"].extend(score.get("mispronunced_words") or [])
        final_score["display_text"] += score.get("display_text") or ""

    # Average each numeric score
    for key, value in final_score.items():
        if isinstance(value, (int, float)):
            final_score[key] = round(value / num_chunks, 2)

    return final_score
78
+
79
+
80
def remove_files(file_list):
    """Delete every path in *file_list*, logging (not raising) failures.

    Uses per-file ``os.remove`` instead of the previous
    ``os.system("rm " + " ".join(file_list))``, which was shell-injectable,
    broke on filenames containing spaces, and was not portable to Windows.
    """
    for file_path in file_list:
        try:
            os.remove(file_path)
        except FileNotFoundError:
            pass  # already gone — keep the best-effort semantics
        except OSError as ex:
            logger.exception(f"Error removing files: {ex}")
    logger.info("All listed files removed.")
app/utils/comprehension.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import Counter
2
+
3
+ import spacy
4
+ import speech_recognition as sr
5
+ from pydub import AudioSegment
6
+
7
+
8
# Comprehension Score Evaluation (No Reference Text)
def evaluate_comprehension(text, doc):
    """Heuristically score comprehension of *doc* without a reference text.

    Combines three 0-10 sub-scores — sentence clarity, lexical diversity
    (topic relevance), and understanding — into an averaged final score.
    """
    # 1. Listening comprehension: shorter sentences read as clearer speech.
    sentences = list(doc.sents)
    avg_sentence_len = (sum(len(s) for s in sentences) / len(sentences)) if sentences else 0

    if avg_sentence_len <= 10:            # short, simple sentences
        listening_comprehension_score = 10
    elif avg_sentence_len <= 20:          # moderate length
        listening_comprehension_score = 8
    else:                                 # long sentences: less clarity
        listening_comprehension_score = 6

    # 2. Topic relevance: lexical diversity as a proxy for on-topic speech.
    words = [token.text.lower() for token in doc if token.is_alpha]
    if not words:
        topic_relevance_score = 0
    else:
        diversity = len(set(words)) / len(words)
        if diversity > 0.7:               # high diversity
            topic_relevance_score = 10
        elif diversity >= 0.4:            # moderate diversity
            topic_relevance_score = 7
        else:                             # repetitive, possibly off-topic
            topic_relevance_score = 4

    # 3. Understanding: reuse the clarity-based score.
    understanding_score = listening_comprehension_score

    # Final score: mean of the three sub-scores.
    comprehension_score = (listening_comprehension_score + topic_relevance_score + understanding_score) / 3

    return {
        "listening_comprehension": round(listening_comprehension_score, 2),
        "topic_relevance": round(topic_relevance_score, 2),
        "understanding": round(understanding_score, 2),
        "comprehension_score": round(comprehension_score, 2),
    }
app/utils/constants.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# Application identity, consumed by the FastAPI app factory.
APP_NAME = "Speech Analysis"
APP_DESCRIPTION = "Speech Analysis ML Inference API"
APP_VERSION = "v1"

# Route prefixes.
API_BASE = "/"
API_PREFIX = "/api/" + APP_VERSION

# Project root: this file lives at <root>/app/utils/constants.py, so walk
# three levels up from the file's absolute path. This is more robust than
# splitting on the substring "/app", which breaks on Windows path separators
# or whenever an ancestor directory name happens to contain "app".
BASE_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
UPLOAD_DIR = os.path.join(BASE_PATH, "audio")  # incoming audio files
STATIC_DIR = os.path.join(BASE_PATH, "app", "static")  # served static assets

# Health-check endpoints.
API_READINESS = "/readiness"
API_LIVENESS = "/liveness"
app/utils/get_scores.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ from concurrent.futures import ThreadPoolExecutor
4
+
5
+ from loguru import logger
6
+
7
+ from app.utils.common import convert_mp3_to_wav, get_doc, transcribe_audio
8
+ from app.utils.comprehension import evaluate_comprehension
9
+ from app.utils.grammar import evaluate_grammar
10
+ from app.utils.intonation import evaluate_intonation
11
+
12
+
13
def get_content_score(audio_file):
    """Compute intonation, grammar, and comprehension scores for a recording.

    Parameters:
        audio_file: path to an ``.mp3`` or WAV audio file.

    Returns:
        dict with the three scores scaled from 0-10 to 0-100 (rounded to two
        decimals) plus the list of detected grammar errors.
    """
    # Step 1: Normalize to WAV. Match the ".mp3" extension case-insensitively
    # instead of the bare suffix "mp3", which would also match names like
    # "foomp3" and miss "FILE.MP3".
    if audio_file.lower().endswith(".mp3"):
        wav_file = convert_mp3_to_wav(audio_file)
    else:
        wav_file = audio_file

    # Step 2: Transcribe the WAV to text
    transcript = transcribe_audio(wav_file)

    # Step 3: Parse once and share the doc across the evaluators.
    doc = get_doc(transcript)

    # Step 4: The three evaluations are independent, so run them concurrently.
    with ThreadPoolExecutor() as executor:
        intonation_future = executor.submit(evaluate_intonation, wav_file)
        grammar_future = executor.submit(evaluate_grammar, transcript, doc)
        comprehension_future = executor.submit(evaluate_comprehension, transcript, doc)

    intonation = intonation_future.result()
    grammar = grammar_future.result()  # fetch once instead of once per key
    comprehension = comprehension_future.result()

    return {
        "intonation_score": round(float(intonation.get("intonation_score")) * 10, 2),
        "grammar_score": round(float(grammar.get("grammar_score")) * 10, 2),
        "grammar_errors": grammar.get("errors"),
        "comprehension_score": round(float(comprehension.get("comprehension_score")) * 10, 2),
    }
app/utils/grammar.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ import speech_recognition as sr
3
+ from pydub import AudioSegment
4
+
5
+
6
# Enhanced Sentence Structure Scoring
def evaluate_sentence_structure(doc):
    """Score sentence structure on a 0-10 scale.

    Each sentence is rated on two axes -- token length and syntactic
    complexity (number of distinct dependency labels, punctuation ignored) --
    and the per-sentence averages are averaged over the whole document.

    Parameters:
        doc: parsed spaCy ``Doc``.

    Returns:
        float in [0, 10]; 0 when the document has no sentences.
    """
    all_sentences = list(doc.sents)
    if not all_sentences:
        return 0

    total = 0
    for sent in all_sentences:
        # Length axis: moderate-length sentences score best.
        n_tokens = len(sent)
        if 10 <= n_tokens <= 20:
            length_score = 10
        elif 20 < n_tokens <= 30:
            length_score = 8
        elif n_tokens > 30:
            length_score = 5
        else:
            length_score = 2

        # Complexity axis: distinct dependency labels, excluding punctuation.
        dep_labels = {tok.dep_ for tok in sent if tok.dep_ != "punct"}
        if len(dep_labels) > 6:  # complex sentence
            complexity_score = 10
        elif len(dep_labels) >= 3:  # moderately complex
            complexity_score = 7
        else:  # simple sentence
            complexity_score = 4

        # Each sentence contributes the mean of its two axis scores.
        total += (length_score + complexity_score) / 2

    # Average over sentences, capped at 10.
    return min(total / len(all_sentences), 10)
39
+
40
+
41
# Grammar scoring based on Sentence Structure, Grammar Usage, and Vocabulary Range
def evaluate_grammar(text, doc):
    """Score grammar on a 0-10 scale.

    Combines three equally weighted sub-scores:
      1. sentence structure (delegated to ``evaluate_sentence_structure``),
      2. grammar usage (starts at 10, deducts per detected error),
      3. vocabulary range (lexical diversity).

    Parameters:
        text: the raw transcript, echoed back in the result.
        doc: parsed spaCy ``Doc`` for the transcript.

    Returns:
        dict with the sub-scores, the final ``grammar_score``, the list of
        detected ``errors``, and the original ``text``.
    """
    # 1. Sentence Structure
    sentence_structure_score = evaluate_sentence_structure(doc)
    errors = []

    # 2. Grammar Usage: assume perfect grammar, deduct for each detected issue.
    # Example:
    # - The boy hungry. # Wrong
    # - The boy is hungry # Correct
    grammar_usage_score = 10
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ != "VERB":
            errors.append(
                {
                    "word": token.text,
                    "position_in_text": token.i,
                    "error": "Subject without a verb",
                    "suggestion": "Ensure the subject is followed by a verb.",
                }
            )
            grammar_usage_score -= 2  # Deduct points for common grammar mistakes
    # Clamp so six or more errors cannot push the sub-score below zero,
    # which would otherwise drag the final average negative.
    grammar_usage_score = max(grammar_usage_score, 0)

    # 3. Vocabulary Range: lexical diversity, guarded against an empty doc
    # (previously a ZeroDivisionError when the transcript had no tokens).
    unique_words = {token.text.lower() for token in doc if token.is_alpha}
    vocabulary_range_score = min(len(unique_words) / len(doc), 1) * 10 if len(doc) else 0

    # Final Grammar Score: (Sentence Structure + Grammar Usage + Vocabulary Range) / 3
    grammar_score = (sentence_structure_score + grammar_usage_score + vocabulary_range_score) / 3

    return {
        "sentence_structure": round(sentence_structure_score, 2),
        "grammar_usage": round(grammar_usage_score, 2),
        "vocabulary_range": round(vocabulary_range_score, 2),
        "grammar_score": round(grammar_score, 2),
        "errors": errors,
        "text": text,
    }
app/utils/intonation.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ from pydub import AudioSegment
4
+
5
+
6
# Feature extraction for Intonation
def evaluate_intonation(wav_file):
    """Score intonation of a WAV recording on a 0-10 scale.

    Combines three equally weighted, clamped sub-scores derived from the
    signal: sentence stress (energy), intonation patterns (pitch variation),
    and rhythm (tempo).

    Parameters:
        wav_file: path to a WAV audio file.

    Returns:
        dict with the three sub-scores and the averaged ``intonation_score``.
    """
    # Load audio using librosa (resampled mono at its default rate).
    y, sr = librosa.load(wav_file)

    # 1. Sentence Stress (based on energy)
    # RMS energy represents perceived loudness, our proxy for sentence stress.
    rms_energy = librosa.feature.rms(y=y)[0]
    avg_energy = float(np.mean(rms_energy) * 10)  # scale up for score calculation

    # 2. Intonation Patterns (based on pitch variation)
    # piptrack estimates pitch over time; keep bins with above-median magnitude.
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    mean_pitch = float(np.mean(pitch_values)) if len(pitch_values) > 0 else 0.0
    if mean_pitch > 0:
        # Coefficient of variation of pitch, scaled toward a 0-10 range.
        pitch_variation = float(np.std(pitch_values) / mean_pitch * 10)
    else:
        # No voiced frames, or all selected pitch bins are zero: avoid the
        # division by zero the unguarded ratio would hit.
        pitch_variation = 0.0

    # 3. Rhythm (based on tempo)
    # beat_track estimates tempo in BPM; newer librosa versions return it as
    # an ndarray, so coerce to a plain Python float either way.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    tempo = float(np.atleast_1d(tempo)[0])
    rhythm_score = (tempo / 120) * 10  # 120 BPM ~ average speaking tempo

    # Clamp every sub-score into [0, 10] before averaging.
    sentence_stress_score = min(max(avg_energy, 0), 10)
    intonation_patterns_score = min(max(pitch_variation, 0), 10)
    rhythm_score = min(max(rhythm_score, 0), 10)

    # Final score: (Sentence Stress + Intonation Patterns + Rhythm) / 3
    intonation_score = (sentence_stress_score + intonation_patterns_score + rhythm_score) / 3

    return {
        "sentence_stress": sentence_stress_score,
        "intonation_patterns": intonation_patterns_score,
        "rhythm": rhythm_score,
        "intonation_score": intonation_score,
    }
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Local development stack for the Speech Analysis API.
version: "3.8"

services:
  web:
    build: .
    # NOTE(review): runs a single uvicorn worker here, while the Dockerfile
    # CMD uses --workers 4 — confirm which is intended for each environment.
    command: "uvicorn app.main:server --host 0.0.0.0 --port 7860"
    volumes:
      # Bind-mount the source tree so local edits are visible without a rebuild.
      - .:/app
    ports:
      - "7860:7860"
    env_file:
      - .env
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Web framework and ASGI server
fastapi
uvicorn
# Audio conversion (requires ffmpeg available at runtime)
pydub
# NLP parsing (en_core_web_sm model is installed in the Dockerfile)
spacy
# Speech-to-text; the only pinned dependency — TODO(review): consider pinning the rest for reproducible builds
SpeechRecognition==3.10.4
# Logging
loguru
# Audio feature extraction
numpy
librosa
# Azure speech service client
azure-cognitiveservices-speech
# Settings and .env loading
python-dotenv
pydantic-settings
# Multipart form parsing for FastAPI file uploads
python-multipart