Spaces:
Runtime error
Runtime error
Commit ·
79b7942
1
Parent(s): ddddf1b
init
Browse files- .gitignore +61 -0
- Dockerfile +21 -0
- __init__.py +0 -0
- app/__init__.py +0 -0
- app/config.py +14 -0
- app/main.py +49 -0
- app/manage.py +27 -0
- app/middlewares/logging.py +42 -0
- app/models/speech_analysis.py +20 -0
- app/services/healthcheck.py +15 -0
- app/services/speech_analysis.py +75 -0
- app/static/index.html +287 -0
- app/utils/ai_speech.py +83 -0
- app/utils/common.py +85 -0
- app/utils/comprehension.py +48 -0
- app/utils/constants.py +15 -0
- app/utils/get_scores.py +36 -0
- app/utils/grammar.py +78 -0
- app/utils/intonation.py +48 -0
- docker-compose.yml +12 -0
- requirements.txt +12 -0
.gitignore
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Compiled Java class files
|
| 2 |
+
*.class
|
| 3 |
+
|
| 4 |
+
# Compiled Python bytecode
|
| 5 |
+
*.py[cod]
|
| 6 |
+
|
| 7 |
+
# Log files
|
| 8 |
+
*.log
|
| 9 |
+
|
| 10 |
+
# Package files
|
| 11 |
+
*.jar
|
| 12 |
+
|
| 13 |
+
# Maven
|
| 14 |
+
target/
|
| 15 |
+
dist/
|
| 16 |
+
|
| 17 |
+
# JetBrains IDE
|
| 18 |
+
.idea/
|
| 19 |
+
|
| 20 |
+
# Unit test reports
|
| 21 |
+
TEST*.xml
|
| 22 |
+
|
| 23 |
+
# Generated by MacOS
|
| 24 |
+
.DS_Store
|
| 25 |
+
|
| 26 |
+
# Generated by Windows
|
| 27 |
+
Thumbs.db
|
| 28 |
+
|
| 29 |
+
# Applications
|
| 30 |
+
*.app
|
| 31 |
+
*.exe
|
| 32 |
+
*.war
|
| 33 |
+
|
| 34 |
+
# Large media files
|
| 35 |
+
*.mp4
|
| 36 |
+
*.tiff
|
| 37 |
+
*.avi
|
| 38 |
+
*.flv
|
| 39 |
+
*.mov
|
| 40 |
+
*.wmv
|
| 41 |
+
|
| 42 |
+
# Byte-compiled / optimized / DLL files
|
| 43 |
+
__pycache__/
|
| 44 |
+
*/__pycache__/
|
| 45 |
+
*/*/__pycache__/
|
| 46 |
+
*/*/*/__pycache__/
|
| 47 |
+
*.py[cod]
|
| 48 |
+
*/*.py[cod]
|
| 49 |
+
*/*/*.py[cod]
|
| 50 |
+
*/*/*/*.py[cod]
|
| 51 |
+
|
| 52 |
+
# VSCode IDE
|
| 53 |
+
.vscode/
|
| 54 |
+
|
| 55 |
+
# Environment
|
| 56 |
+
env/
|
| 57 |
+
venv/
|
| 58 |
+
.env
|
| 59 |
+
.venv
|
| 60 |
+
|
| 61 |
+
audio/
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Install dependencies first so this layer stays cached until
# requirements.txt changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install the spaCy English model used by the grammar/content scoring
RUN python -m spacy download en_core_web_sm

# Copy the current directory contents into the container
COPY . .

# Expose the port the FastAPI app listens on
EXPOSE 7860

# Run the application with uvicorn and 4 worker processes
# (NOTE: despite the original comment, this is plain uvicorn, not gunicorn)
CMD ["uvicorn", "app.main:server", "--host", "0.0.0.0", "--port", "7860", "--workers", "4"]
|
__init__.py
ADDED
|
File without changes
|
app/__init__.py
ADDED
|
File without changes
|
app/config.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application settings, loaded by pydantic-settings from the process
    environment or the ``.env`` file.

    The fields are declared as required (no default) so pydantic-settings
    performs the environment lookup itself and fails loudly at startup when
    a key is missing. The original ``str = os.getenv(...)`` defaults
    bypassed that mechanism and could silently store ``None`` under a
    ``str`` annotation when the variable was unset.
    """

    AZURE_AI_SUBSCRIPTION_KEY: str
    AZURE_AI_REGION: str

    class Config:
        # Also read values from a local .env file when present.
        env_file = ".env"


# Module-level singleton imported throughout the app.
settings = Settings()
|
app/main.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

import uvicorn
from fastapi import FastAPI
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles

from app.config import settings
from app.manage import configure_app
from app.utils.constants import APP_DESCRIPTION, APP_NAME, APP_VERSION, DOCS_URL, OPENAPI_URL, STATIC_DIR


def initialize_app():
    """Create and configure the FastAPI application.

    Registers routers/middlewares via configure_app, mounts the static
    folder, and defines the index route.

    Returns:
        FastAPI: the fully configured application instance.
    """
    fastapi_app = FastAPI(title=APP_NAME, version=APP_VERSION, description=APP_DESCRIPTION)

    # App configuration (routers + middlewares) - one time
    configure_app(fastapi_app)

    # Serve frontend assets from the static folder
    fastapi_app.mount(
        "/static",
        StaticFiles(directory=STATIC_DIR),
        name="static",
    )

    @fastapi_app.get("/")
    async def read_index():
        # Explicit encoding: the container's locale default is not
        # guaranteed to be UTF-8, and index.html contains UTF-8 markup.
        with open(os.path.join(STATIC_DIR, "index.html"), encoding="utf-8") as f:
            return HTMLResponse(content=f.read(), status_code=200)

    return fastapi_app


# Initialize the app at import time so "app.main:server" is resolvable
# by uvicorn/gunicorn (see Dockerfile CMD).
server = initialize_app()


def start():
    """Run the app with uvicorn for local development (Docker uses CMD)."""
    uvicorn.run(server, host="0.0.0.0", port=8000)


if __name__ == "__main__":
    start()
|
app/manage.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi.middleware.cors import CORSMiddleware

from app.middlewares.logging import RequestLoggingMiddleware
from app.services.speech_analysis import speech_analysis_router
from app.utils.constants import API_PREFIX

__all__ = ["configure_app"]


def configure_app(app):
    """Attach the application's routers and middlewares to *app*.

    Returns the same FastAPI instance, fully configured.
    """
    # Routers: all endpoints live under the API prefix.
    app.include_router(speech_analysis_router, prefix=API_PREFIX)

    # Middlewares (registration order preserved: logging first, CORS second).
    app.add_middleware(RequestLoggingMiddleware)

    # Permissive CORS so the static frontend can call the API from anywhere.
    cors_options = {
        "allow_origins": ["*"],
        "allow_credentials": True,
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    app.add_middleware(CORSMiddleware, **cors_options)

    return app
|
app/middlewares/logging.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
import uuid
import time

import loguru
from starlette.middleware.base import BaseHTTPMiddleware

# Single shared loguru logger; every record carries the per-request id
# injected via logger.contextualize() below.
logger = loguru.logger
logger.remove()
logger.add(
    sys.stdout,
    format="{time} - {level} - {file.path} - {function} - {line} - ({extra[request_id]}) :: {message}",
    level="DEBUG",
    backtrace=False,
)


class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """Middleware for logging incoming HTTP requests and their responses.

    Attributes:
        app: The ASGI application to which the middleware is applied.

    Methods:
        dispatch: Intercepts each HTTP request, tags all log records with a
            fresh request id, logs start/end, and logs (then re-raises) any
            exception raised while handling the request.
    """

    def __init__(self, app):
        super().__init__(app)

    async def dispatch(self, request, call_next):
        request_id = str(uuid.uuid4())
        with logger.contextualize(request_id=request_id):
            logger.info("Request started")
            try:
                return await call_next(request)
            except Exception as ex:
                logger.exception(f"Request failed: {ex}")
                # Re-raise so the framework's error handling still produces
                # a response. The original swallowed the exception, which
                # made dispatch() return None and broke the ASGI contract.
                raise
            finally:
                logger.info("Request ended")
|
app/models/speech_analysis.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel


class SpeechAnalysisResponse(BaseModel):
    """Response schema for the /analyze endpoint.

    Bundles the numeric scores, per-category textual feedback, the detected
    error lists, and the recognized transcript returned to the frontend.
    Field names are part of the public API contract read by the frontend
    JavaScript — do not rename.
    """

    # Numeric scores (floats; the frontend renders them as percentages)
    fluency_score: float
    pronunciation_score: float
    completeness_score: float
    accuracy_score: float
    grammar_score: float
    comprehension_score: float
    intonation_score: float
    # Per-category textual feedback (currently placeholder "Demo Content")
    pronunciation_feedback: str
    fluency_feedback: str
    accuracy_feedback: str
    grammar_feedback: str
    intonation_feedback: str
    comprehension_feedback: str
    # Detected grammar errors: list of dicts with word/position/suggestion
    grammar_errors: list
    # NOTE(review): "mispronunced" is misspelled but kept — the frontend
    # reads data.mispronunced_words by this exact name.
    mispronunced_words: list
    # Recognizer's display-form transcript of the audio
    display_text: str
|
app/services/healthcheck.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Response

from app.utils.constants import API_READINESS, API_LIVENESS

# Router holding the (schema-hidden) health probe endpoints.
router = APIRouter()


@router.get(API_READINESS, include_in_schema=False)
async def readiness_controller(response: Response):
    """Readiness probe: replies 200 with an empty body."""
    response.status_code = 200


@router.get(API_LIVENESS, include_in_schema=False)
async def liveness_controller(response: Response):
    """Liveness probe: replies 200 with an empty body."""
    response.status_code = 200
|
app/services/speech_analysis.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor

from fastapi import APIRouter, FastAPI, File, Form, UploadFile
from fastapi.responses import HTMLResponse
from pydub import AudioSegment

from app.utils.ai_speech import pronunciation_assessment
from app.utils.common import aggregate_scores, remove_files, split_audio
from app.utils.get_scores import get_content_score
from app.models.speech_analysis import SpeechAnalysisResponse
from app.utils.constants import UPLOAD_DIR
from loguru import logger

speech_analysis_router = APIRouter()


@speech_analysis_router.post("/analyze", response_model=SpeechAnalysisResponse)
async def analyze_speech(audio_file: UploadFile = File(...), language: str = Form(...)):
    """Analyze an uploaded speech recording and return the combined scores.

    The upload is saved to UPLOAD_DIR, converted from mp3 to wav when
    needed, split into chunks when longer than 30 s, then scored in
    parallel: Azure pronunciation assessment per chunk plus content
    (grammar/intonation/comprehension) scoring on the full file. The
    per-chunk pronunciation scores are aggregated into one response dict.
    """
    # Make audio directory if not available
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    s_time = time.time()
    # basename() strips client-supplied directory components so an
    # untrusted filename cannot escape UPLOAD_DIR (path traversal).
    audio_file_path = os.path.join(UPLOAD_DIR, os.path.basename(audio_file.filename))

    with open(audio_file_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    # Convert mp3 uploads to wav with pydub. The original called an
    # undefined convert_mp3_to_wav() (NameError on any mp3 upload) and
    # replaced "mp3" anywhere in the path instead of only the suffix.
    if audio_file_path.endswith(".mp3"):
        wav_path = audio_file_path[: -len(".mp3")] + ".wav"
        AudioSegment.from_mp3(audio_file_path).export(wav_path, format="wav")
        audio_file_path = wav_path

    chunk_paths = [audio_file_path]  # default: no splitting needed
    try:
        # Split audio if longer than 30 seconds
        audio_length_ms = AudioSegment.from_wav(audio_file_path).duration_seconds * 1000
        if audio_length_ms > 30000:
            chunk_paths = split_audio(audio_file_path)

        # Content scoring on the full file and pronunciation assessment
        # per chunk, all in parallel.
        with ThreadPoolExecutor() as executor:
            content_future = executor.submit(get_content_score, audio_file_path)
            pronunciation_futures = [executor.submit(pronunciation_assessment, path, language) for path in chunk_paths]
            content_score = content_future.result()
            chunk_scores = [future.result() for future in pronunciation_futures]

        # Aggregate pronunciation scores across chunks
        final_azure_speech_score = aggregate_scores(chunk_scores)

        # Merge content-based scores and (placeholder) feedback texts
        final_azure_speech_score.update(
            {
                "grammar_score": content_score.get("grammar_score"),
                "intonation_score": content_score.get("intonation_score"),
                "comprehension_score": content_score.get("comprehension_score"),
                "grammar_errors": content_score.get("grammar_errors"),
                "pronunciation_feedback": "Demo Content",
                "fluency_feedback": "Demo Content",
                "accuracy_feedback": "Demo Content",
                "grammar_feedback": "Demo Content",
                "intonation_feedback": "Demo Content",
                "comprehension_feedback": "Demo Content",
            }
        )

        logger.info(str(time.time() - s_time))
        return final_azure_speech_score
    finally:
        # Clean up chunk files and the saved upload even when analysis
        # fails (the original leaked files on any exception and passed
        # audio_file_path to remove_files twice in the no-split case);
        # dict.fromkeys() de-duplicates while preserving order.
        remove_files(list(dict.fromkeys(chunk_paths + [audio_file_path])))
|
app/static/index.html
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
|
| 4 |
+
<head>
|
| 5 |
+
<meta charset="UTF-8">
|
| 6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 7 |
+
<title>Speech Analysis</title>
|
| 8 |
+
<!-- Bootstrap CSS -->
|
| 9 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 10 |
+
<style>
|
| 11 |
+
body {
|
| 12 |
+
background-color: #f4f7f8;
|
| 13 |
+
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
.container {
|
| 17 |
+
margin-top: 50px;
|
| 18 |
+
padding: 20px;
|
| 19 |
+
background: #fff;
|
| 20 |
+
box-shadow: 0 0 15px rgba(0, 0, 0, 0.1);
|
| 21 |
+
border-radius: 10px;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
.progress-bar-inner {
|
| 25 |
+
width: 0%;
|
| 26 |
+
transition: width 1s ease-in-out;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
.audio-player {
|
| 30 |
+
margin-top: 20px;
|
| 31 |
+
}
|
| 32 |
+
|
| 33 |
+
.btn-analyze {
|
| 34 |
+
background-color: #007bff;
|
| 35 |
+
color: white;
|
| 36 |
+
border-radius: 5px;
|
| 37 |
+
padding: 10px 20px;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
.btn-analyze:hover {
|
| 41 |
+
background-color: #0056b3;
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
.score-label {
|
| 45 |
+
display: flex;
|
| 46 |
+
justify-content: space-between;
|
| 47 |
+
font-weight: 600;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
.feedback-section {
|
| 51 |
+
margin-top: 30px;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
.highlight-mispronounced {
|
| 55 |
+
background-color: yellow;
|
| 56 |
+
font-weight: bold;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.highlight {
|
| 60 |
+
background-color: yellow;
|
| 61 |
+
font-weight: bold;
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
.highlight-grammar {
|
| 65 |
+
background-color: lightpink;
|
| 66 |
+
/* or any other color */
|
| 67 |
+
font-weight: bold;
|
| 68 |
+
}
|
| 69 |
+
</style>
|
| 70 |
+
</head>
|
| 71 |
+
|
| 72 |
+
<body>
|
| 73 |
+
|
| 74 |
+
<div class="container">
|
| 75 |
+
<h1 class="text-center mb-4">Speech Analysis</h1>
|
| 76 |
+
|
| 77 |
+
<div class="mb-3">
|
| 78 |
+
<label for="audio-file" class="form-label">Upload your audio file:</label>
|
| 79 |
+
<input type="file" class="form-control" id="audio-file" accept="audio/*">
|
| 80 |
+
</div>
|
| 81 |
+
|
| 82 |
+
<!-- Language Dropdown -->
|
| 83 |
+
<div class="mb-3">
|
| 84 |
+
<label for="language-select" class="form-label">Select Language:</label>
|
| 85 |
+
<select class="form-select" id="language-select">
|
| 86 |
+
<option value="en-GB">English (United Kingdom)</option>
|
| 87 |
+
<option value="nb-NO">Norwegian</option>
|
| 88 |
+
</select>
|
| 89 |
+
</div>
|
| 90 |
+
|
| 91 |
+
<div class="text-center mb-4">
|
| 92 |
+
<button class="btn btn-analyze" onclick="analyzeAudio()">Analyze Speech</button>
|
| 93 |
+
</div>
|
| 94 |
+
|
| 95 |
+
<div class="audio-player text-center" id="audio-player-container" style="display: none;">
|
| 96 |
+
<audio id="audio-player" controls></audio>
|
| 97 |
+
</div>
|
| 98 |
+
|
| 99 |
+
<h3 class="mt-5">Speech Scores</h3>
|
| 100 |
+
|
| 101 |
+
<!-- Scores with Progress Bars -->
|
| 102 |
+
<div id="scores-container">
|
| 103 |
+
<div class="mb-3">
|
| 104 |
+
<div class="score-label">
|
| 105 |
+
<span>Fluency Score</span>
|
| 106 |
+
<span id="fluency-score">0%</span>
|
| 107 |
+
</div>
|
| 108 |
+
<div class="progress">
|
| 109 |
+
<div class="progress-bar progress-bar-inner bg-success" id="fluency-progress" role="progressbar"
|
| 110 |
+
aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 111 |
+
</div>
|
| 112 |
+
</div>
|
| 113 |
+
|
| 114 |
+
<div class="mb-3">
|
| 115 |
+
<div class="score-label">
|
| 116 |
+
<span>Pronunciation Score</span>
|
| 117 |
+
<span id="pronunciation-score">0%</span>
|
| 118 |
+
</div>
|
| 119 |
+
<div class="progress">
|
| 120 |
+
<div class="progress-bar progress-bar-inner bg-primary" id="pronunciation-progress"
|
| 121 |
+
role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 122 |
+
</div>
|
| 123 |
+
</div>
|
| 124 |
+
|
| 125 |
+
<div class="mb-3">
|
| 126 |
+
<div class="score-label">
|
| 127 |
+
<span>Completeness Score</span>
|
| 128 |
+
<span id="completeness-score">0%</span>
|
| 129 |
+
</div>
|
| 130 |
+
<div class="progress">
|
| 131 |
+
<div class="progress-bar progress-bar-inner bg-warning" id="completeness-progress"
|
| 132 |
+
role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 133 |
+
</div>
|
| 134 |
+
</div>
|
| 135 |
+
|
| 136 |
+
<div class="mb-3">
|
| 137 |
+
<div class="score-label">
|
| 138 |
+
<span>Accuracy Score</span>
|
| 139 |
+
<span id="accuracy-score">0%</span>
|
| 140 |
+
</div>
|
| 141 |
+
<div class="progress">
|
| 142 |
+
<div class="progress-bar progress-bar-inner bg-danger" id="accuracy-progress" role="progressbar"
|
| 143 |
+
aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 144 |
+
</div>
|
| 145 |
+
</div>
|
| 146 |
+
|
| 147 |
+
<div class="mb-3">
|
| 148 |
+
<div class="score-label">
|
| 149 |
+
<span>Grammar Score</span>
|
| 150 |
+
<span id="grammar-score">0%</span>
|
| 151 |
+
</div>
|
| 152 |
+
<div class="progress">
|
| 153 |
+
<div class="progress-bar progress-bar-inner bg-info" id="grammar-progress" role="progressbar"
|
| 154 |
+
aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 155 |
+
</div>
|
| 156 |
+
</div>
|
| 157 |
+
|
| 158 |
+
<div class="mb-3">
|
| 159 |
+
<div class="score-label">
|
| 160 |
+
<span>Comprehension Score</span>
|
| 161 |
+
<span id="comprehension-score">0%</span>
|
| 162 |
+
</div>
|
| 163 |
+
<div class="progress">
|
| 164 |
+
<div class="progress-bar progress-bar-inner bg-dark" id="comprehension-progress" role="progressbar"
|
| 165 |
+
aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 166 |
+
</div>
|
| 167 |
+
</div>
|
| 168 |
+
|
| 169 |
+
<div class="mb-3">
|
| 170 |
+
<div class="score-label">
|
| 171 |
+
<span>Intonation Score</span>
|
| 172 |
+
<span id="intonation-score">0%</span>
|
| 173 |
+
</div>
|
| 174 |
+
<div class="progress">
|
| 175 |
+
<div class="progress-bar progress-bar-inner bg-secondary" id="intonation-progress"
|
| 176 |
+
role="progressbar" aria-valuenow="0" aria-valuemin="0" aria-valuemax="100"></div>
|
| 177 |
+
</div>
|
| 178 |
+
</div>
|
| 179 |
+
</div>
|
| 180 |
+
|
| 181 |
+
<div class="feedback-section">
|
| 182 |
+
<h4>Speech Analysis Feedback</h4>
|
| 183 |
+
<p><strong>Identified Text:</strong> <span id="identified-text"></span></p>
|
| 184 |
+
|
| 185 |
+
<div id="feedback-container">
|
| 186 |
+
<!-- <p><strong>Pronunciation Feedback:</strong> <span id="pronunciation-feedback"></span></p> -->
|
| 187 |
+
<!-- <p><strong>Fluency Feedback:</strong> <span id="fluency-feedback"></span></p> -->
|
| 188 |
+
<!-- <p><strong>Accuracy Feedback:</strong> <span id="accuracy-feedback"></span></p> -->
|
| 189 |
+
<!-- <p><strong>Grammar Feedback:</strong> <span id="grammar-feedback"></span></p> -->
|
| 190 |
+
<!-- <p><strong>Intonation Feedback:</strong> <span id="intonation-feedback"></span></p>
|
| 191 |
+
<p><strong>Comprehension Feedback:</strong> <span id="comprehension-feedback"></span></p> -->
|
| 192 |
+
</div>
|
| 193 |
+
</div>
|
| 194 |
+
</div>
|
| 195 |
+
|
| 196 |
+
<!-- Bootstrap JS and Popper.js -->
|
| 197 |
+
<script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.7/dist/umd/popper.min.js"></script>
|
| 198 |
+
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0-alpha1/dist/js/bootstrap.min.js"></script>
|
| 199 |
+
|
| 200 |
+
<script>
|
| 201 |
+
function analyzeAudio() {
|
| 202 |
+
const audioFile = document.getElementById("audio-file").files[0];
|
| 203 |
+
const language = document.getElementById("language-select").value;
|
| 204 |
+
|
| 205 |
+
if (!audioFile) {
|
| 206 |
+
alert("Please upload an audio file.");
|
| 207 |
+
return;
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
const formData = new FormData();
|
| 211 |
+
formData.append("audio_file", audioFile);
|
| 212 |
+
formData.append("language", language)
|
| 213 |
+
|
| 214 |
+
fetch("http://127.0.0.1:7860/api/v1/analyze", {
|
| 215 |
+
method: "POST",
|
| 216 |
+
body: formData
|
| 217 |
+
})
|
| 218 |
+
.then(response => response.json())
|
| 219 |
+
.then(data => {
|
| 220 |
+
updateProgress('fluency', data.fluency_score);
|
| 221 |
+
updateProgress('pronunciation', data.pronunciation_score);
|
| 222 |
+
updateProgress('completeness', data.completeness_score);
|
| 223 |
+
updateProgress('accuracy', data.accuracy_score);
|
| 224 |
+
updateProgress('grammar', data.grammar_score);
|
| 225 |
+
updateProgress('comprehension', data.comprehension_score);
|
| 226 |
+
updateProgress('intonation', data.intonation_score);
|
| 227 |
+
|
| 228 |
+
const audioPlayer = document.getElementById("audio-player");
|
| 229 |
+
const audioURL = URL.createObjectURL(audioFile);
|
| 230 |
+
audioPlayer.src = audioURL;
|
| 231 |
+
document.getElementById("audio-player-container").style.display = "block";
|
| 232 |
+
|
| 233 |
+
// Example data to simulate the transcript and errors from the backend
|
| 234 |
+
// const grammar_errors = [{ 'word': 'dismissal', 'position_in_text': 2, 'error': 'Subject without a verb', 'suggestion': 'Ensure the subject is followed by a verb.' }, { 'word': 'college', 'position_in_text': 4, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'he', 'position_in_text': 10, 'error': 'Subject without a verb', 'suggestion': 'Ensure the subject is followed by a verb.' }, { 'word': 'reaction', 'position_in_text': 13, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'experiences', 'position_in_text': 24, 'error': 'Subject without a verb', 'suggestion': 'Ensure the subject is followed by a verb.' }, { 'word': 'experiences', 'position_in_text': 24, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'factory', 'position_in_text': 32, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'success', 'position_in_text': 38, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'leader', 'position_in_text': 42, 'error': 'Subject without a verb', 'suggestion': 'Ensure the subject is followed by a verb.' }, { 'word': 'environment', 'position_in_text': 52, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'clashes', 'position_in_text': 61, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'Illusion', 'position_in_text': 64, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'incense', 'position_in_text': 73, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." 
}, { 'word': 'violence', 'position_in_text': 75, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'write', 'position_in_text': 77, 'error': 'Missing article', 'suggestion': "Add an article ('a', 'an', 'the') before this noun." }, { 'word': 'which', 'position_in_text': 82, 'error': 'Subject without a verb', 'suggestion': 'Ensure the subject is followed by a verb.' }]
|
| 235 |
+
|
| 236 |
+
// Function to highlight mispronounced words and grammar errors
|
| 237 |
+
function highlightErrors(text, mispronouncedWords, grammarErrors) {
|
| 238 |
+
const textArray = text.split(" ");
|
| 239 |
+
const highlightedTextArray = textArray.map((word, index) => {
|
| 240 |
+
// Check for mispronounced words
|
| 241 |
+
const mispronounced = mispronouncedWords.find(mw => mw.word === word && mw.position_in_text === index);
|
| 242 |
+
if (mispronounced) {
|
| 243 |
+
return `<span class="highlight-mispronounced">${word}</span>`;
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
// Check for grammar errors
|
| 247 |
+
const grammarError = grammarErrors.find(ge => ge.word === word && ge.position_in_text === index);
|
| 248 |
+
console.log("GRAMMAR ERROR: ", grammarError)
|
| 249 |
+
if (grammarError) {
|
| 250 |
+
return `<span class="highlight-grammar">${word}</span>`;
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
+
return word; // Return unmodified if no errors
|
| 254 |
+
});
|
| 255 |
+
|
| 256 |
+
return highlightedTextArray.join(" ");
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
// Apply the highlighting function to the DisplayText
|
| 260 |
+
const highlightedText = highlightErrors(data.display_text, data.mispronunced_words, data.grammar_errors);
|
| 261 |
+
|
| 262 |
+
// Inject the highlighted text into an HTML element
|
| 263 |
+
document.getElementById("identified-text").innerHTML = highlightedText;
|
| 264 |
+
|
| 265 |
+
// Update feedback
|
| 266 |
+
document.getElementById("pronunciation-feedback").textContent = data.pronunciation_feedback;
|
| 267 |
+
document.getElementById("fluency-feedback").textContent = data.fluency_feedback;
|
| 268 |
+
document.getElementById("accuracy-feedback").textContent = data.accuracy_feedback;
|
| 269 |
+
document.getElementById("grammar-feedback").textContent = data.grammar_feedback;
|
| 270 |
+
document.getElementById("intonation-feedback").textContent = data.intonation_feedback;
|
| 271 |
+
document.getElementById("comprehension-feedback").textContent = data.comprehension_feedback;
|
| 272 |
+
})
|
| 273 |
+
.catch(error => console.error('Error:', error));
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
function updateProgress(scoreType, score) {
|
| 277 |
+
const progressBar = document.getElementById(`${scoreType}-progress`);
|
| 278 |
+
const scoreLabel = document.getElementById(`${scoreType}-score`);
|
| 279 |
+
progressBar.style.width = `${score}%`;
|
| 280 |
+
progressBar.setAttribute('aria-valuenow', score);
|
| 281 |
+
scoreLabel.textContent = `${score}%`;
|
| 282 |
+
}
|
| 283 |
+
</script>
|
| 284 |
+
|
| 285 |
+
</body>
|
| 286 |
+
|
| 287 |
+
</html>
|
app/utils/ai_speech.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
|
| 6 |
+
import azure.cognitiveservices.speech as speechsdk
|
| 7 |
+
from dotenv import load_dotenv
|
| 8 |
+
from pydub import AudioSegment
|
| 9 |
+
|
| 10 |
+
from app.config import settings
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def text_details(response_json):
|
| 14 |
+
mispronunced_words = []
|
| 15 |
+
nbest = response_json.get("NBest", [])[0] # Assumes you use the top result
|
| 16 |
+
word_list = nbest.get("Words", [])
|
| 17 |
+
|
| 18 |
+
for word_info in word_list:
|
| 19 |
+
if word_info["PronunciationAssessment"]["ErrorType"] == "Mispronunciation":
|
| 20 |
+
# Collect necessary details
|
| 21 |
+
mispronunced_words.append(
|
| 22 |
+
{
|
| 23 |
+
"word": word_info["Word"],
|
| 24 |
+
"offset": word_info["Offset"],
|
| 25 |
+
"position_in_text": word_list.index(word_info), # Get index for order reference
|
| 26 |
+
}
|
| 27 |
+
)
|
| 28 |
+
|
| 29 |
+
display_text = nbest["Display"]
|
| 30 |
+
|
| 31 |
+
return {"mispronunced_words": mispronunced_words, "display_text": display_text}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def pronunciation_assessment(file_path, language):
    """Run Azure pronunciation assessment on an audio file.

    Args:
        file_path: Path to a .wav or .mp3 file; mp3 is transcoded to wav first.
        language: BCP-47 recognition language, e.g. "en-US".

    Returns:
        dict with pronunciation/accuracy/fluency/completeness scores plus
        mispronounced-word details and the display transcript.

    Raises:
        RuntimeError: if recognition does not succeed (NoMatch, Canceled, ...).
    """
    # The Speech SDK file input expects WAV; transcode MP3 on the fly.
    if str(file_path).endswith("mp3"):
        mp3_path = file_path
        file_path = file_path.replace(".mp3", ".wav")
        sound = AudioSegment.from_mp3(mp3_path)
        sound.export(file_path, format="wav")

    # Initialize speech config with the project's Azure credentials.
    speech_config = speechsdk.SpeechConfig(
        subscription=settings.AZURE_AI_SUBSCRIPTION_KEY, region=settings.AZURE_AI_REGION
    )
    speech_config.speech_recognition_language = language

    # Hundred-mark grading with phoneme granularity; miscue detection on.
    pronunciation_config = speechsdk.PronunciationAssessmentConfig(
        grading_system=speechsdk.PronunciationAssessmentGradingSystem.HundredMark,
        granularity=speechsdk.PronunciationAssessmentGranularity.Phoneme,
        enable_miscue=True,
    )

    audio_config = speechsdk.audio.AudioConfig(filename=file_path)
    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
    pronunciation_config.apply_to(recognizer)

    # Single-shot recognition; blocks until the first utterance is processed.
    result = recognizer.recognize_once()

    if result.reason != speechsdk.ResultReason.RecognizedSpeech:
        # Previously the function fell through here and hit an unbound
        # `analysis_results` (and text_details crashed on the missing "NBest"
        # payload of a NoMatch result). Fail loudly with the SDK reason instead.
        raise RuntimeError(f"Speech recognition failed: {result.reason}")

    # Parse word-level details only after a successful recognition.
    audio_text_details = text_details(json.loads(result.json))
    pronunciation_result = speechsdk.PronunciationAssessmentResult(result)

    return {
        "File": file_path,
        "pronunciation_score": pronunciation_result.pronunciation_score,
        "accuracy_score": pronunciation_result.accuracy_score,
        "fluency_score": pronunciation_result.fluency_score,
        "completeness_score": pronunciation_result.completeness_score,
        "mispronunced_words": audio_text_details.get("mispronunced_words", []),
        "display_text": audio_text_details.get("display_text", ""),
    }
|
app/utils/common.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import spacy
|
| 4 |
+
import speech_recognition as sr
|
| 5 |
+
from loguru import logger
|
| 6 |
+
from pydub import AudioSegment
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
# Load spaCy's English language model for grammar and text analysis
|
| 10 |
+
def get_doc(text):
    """Parse *text* with spaCy's small English model and return the Doc.

    The loaded pipeline is cached on the function object because
    `spacy.load` is expensive and this helper runs per request.

    Raises:
        OSError: if the model is not installed (after logging how to get it).
    """
    try:
        nlp = getattr(get_doc, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            get_doc._nlp = nlp  # cache for subsequent calls
        return nlp(text)
    except OSError:
        # NOTE(review): the original code ended in a truncated `os.` statement
        # here (a syntax error); surface the problem to the caller instead.
        logger.exception("Error while loading spaCy model")
        logger.info("Please download the model with this command: python -m spacy download en_core_web_sm")
        raise
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Convert MP3 to WAV using pydub
|
| 22 |
+
def convert_mp3_to_wav(mp3_file, wav_file="converted_comprehension_audio.wav"):
    """Transcode an MP3 file to WAV and return the output path."""
    AudioSegment.from_mp3(mp3_file).export(wav_file, format="wav")
    return wav_file
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Convert WAV to text using SpeechRecognition
|
| 29 |
+
def transcribe_audio(wav_file):
    """Transcribe a WAV file to text using Google's free speech API."""
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_file) as audio_source:
        recorded = recognizer.record(audio_source)
        transcript = recognizer.recognize_google(recorded)
    return transcript
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def split_audio(audio_path, chunk_length_ms=30000):
    """Splits audio into chunks of specified length (in milliseconds)."""
    audio = AudioSegment.from_wav(audio_path)
    chunk_paths = []

    # Walk the audio in fixed-size windows and export each piece to its own file.
    for idx, start in enumerate(range(0, len(audio), chunk_length_ms)):
        piece = audio[start : start + chunk_length_ms]
        out_path = f"{audio_path}_chunk_{idx}.wav"
        piece.export(out_path, format="wav")
        chunk_paths.append(out_path)

    return chunk_paths
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def aggregate_scores(chunk_scores):
    """Aggregate per-chunk assessment scores into one final score.

    Numeric scores are averaged across chunks (rounded to 2 decimals);
    the mispronounced-word lists are concatenated and display texts joined.

    Args:
        chunk_scores: list of per-chunk score dicts.

    Returns:
        dict with averaged numeric scores, combined "mispronunced_words"
        and concatenated "display_text". All-zero/empty when *chunk_scores*
        is empty (previously this raised ZeroDivisionError).
    """
    numeric_keys = ("accuracy_score", "fluency_score", "completeness_score", "pronunciation_score")
    final_score = {key: 0 for key in numeric_keys}
    final_score["mispronunced_words"] = []
    final_score["display_text"] = ""

    num_chunks = len(chunk_scores)
    if num_chunks == 0:
        # Nothing to aggregate — avoid dividing by zero below.
        return final_score

    # Sum each score across chunks.
    for score in chunk_scores:
        for key in numeric_keys:
            final_score[key] += score.get(key, 0)
        # Default to empty values so a chunk missing these keys cannot crash
        # extend()/string concatenation (the original used get() with no default).
        final_score["mispronunced_words"].extend(score.get("mispronunced_words") or [])
        final_score["display_text"] += score.get("display_text") or ""

    # Average each numeric score.
    for key in numeric_keys:
        final_score[key] = round(final_score[key] / num_chunks, 2)

    return final_score
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def remove_files(file_list):
    """Delete every file in *file_list*, logging (not raising) on failure.

    Uses os.remove() per file instead of shelling out to `rm`, which was
    unsafe for paths containing spaces or shell metacharacters, non-portable,
    and never raised (os.system reports failure via exit code, so the old
    try/except could not fire).
    """
    for path in file_list:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone — nothing to do.
            pass
        except OSError as ex:
            logger.exception(f"Error removing file {path}: {ex}")
    logger.info("All listed files removed.")
|
app/utils/comprehension.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import Counter
|
| 2 |
+
|
| 3 |
+
import spacy
|
| 4 |
+
import speech_recognition as sr
|
| 5 |
+
from pydub import AudioSegment
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Comprehension Score Evaluation (No Reference Text)
|
| 9 |
+
# Comprehension Score Evaluation (No Reference Text)
def evaluate_comprehension(text, doc):
    """Score comprehension of free speech without a reference transcript.

    Combines three 0-10 sub-scores — listening comprehension (sentence
    clarity), topic relevance (lexical diversity) and understanding —
    into a single averaged comprehension score.
    """
    # --- 1. Listening comprehension: shorter, clearer sentences score higher.
    sentences = list(doc.sents)
    if sentences:
        avg_sentence_length = sum(len(sent) for sent in sentences) / len(sentences)
    else:
        avg_sentence_length = 0

    if avg_sentence_length <= 10:          # short and simple
        listening_comprehension_score = 10
    elif avg_sentence_length <= 20:        # moderate length
        listening_comprehension_score = 8
    else:                                  # long sentences read as less clear
        listening_comprehension_score = 6

    # --- 2. Topic relevance: lexical diversity as a proxy for staying on topic.
    words = [token.text.lower() for token in doc if token.is_alpha]
    if not words:
        topic_relevance_score = 0
    else:
        diversity = len(set(words)) / len(words)
        if diversity > 0.7:                # high diversity — likely relevant
            topic_relevance_score = 10
        elif diversity >= 0.4:             # moderate diversity
            topic_relevance_score = 7
        else:                              # heavy repetition — possibly off-topic
            topic_relevance_score = 4

    # --- 3. Understanding: reuse the clarity-based score directly.
    understanding_score = listening_comprehension_score

    # Final score: mean of the three components.
    comprehension_score = (listening_comprehension_score + topic_relevance_score + understanding_score) / 3

    return {
        "listening_comprehension": round(listening_comprehension_score, 2),
        "topic_relevance": round(topic_relevance_score, 2),
        "understanding": round(understanding_score, 2),
        "comprehension_score": round(comprehension_score, 2),
    }
|
app/utils/constants.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# Application metadata constants.
APP_NAME = "Speech Analysis"
APP_DESCRIPTION = "Speech Analysis ML Inference API"
APP_VERSION = "v1"

# URL prefixes: API_BASE is the root path, API_PREFIX is the versioned API root.
API_BASE = "/"
API_PREFIX = "/api/" + APP_VERSION

# Project root, derived by stripping everything from the first "/app" in this
# file's absolute path. NOTE(review): fragile — breaks if the checkout path
# itself contains "/app", and assumes POSIX "/" separators; confirm deployment
# layout before relying on it elsewhere.
BASE_PATH = os.path.dirname(os.path.abspath(__file__)).split("/app")[0]
UPLOAD_DIR = os.path.join(BASE_PATH, "audio")  # where uploaded audio is stored
STATIC_DIR = os.path.join(BASE_PATH, "app", "static")  # served static assets

# Health-probe endpoint paths.
API_READINESS = "/readiness"
API_LIVENESS = "/liveness"
|
app/utils/get_scores.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import os
|
| 3 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 4 |
+
|
| 5 |
+
from loguru import logger
|
| 6 |
+
|
| 7 |
+
from app.utils.common import convert_mp3_to_wav, get_doc, transcribe_audio
|
| 8 |
+
from app.utils.comprehension import evaluate_comprehension
|
| 9 |
+
from app.utils.grammar import evaluate_grammar
|
| 10 |
+
from app.utils.intonation import evaluate_intonation
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def get_content_score(audio_file):
    """Compute intonation, grammar and comprehension scores for an audio file.

    MP3 input is converted to WAV first; the audio is transcribed once, parsed
    once, and the three evaluators run concurrently. Each 0-10 sub-score is
    scaled to a 0-100 percentage and rounded to 2 decimals.
    """
    wav_file = convert_mp3_to_wav(audio_file) if audio_file.endswith("mp3") else audio_file

    # Transcribe the WAV to text, then parse it once for both text evaluators.
    transcript = transcribe_audio(wav_file)
    doc = get_doc(transcript)

    with ThreadPoolExecutor() as executor:
        intonation_future = executor.submit(evaluate_intonation, wav_file)
        grammar_future = executor.submit(evaluate_grammar, transcript, doc)
        comprehension_future = executor.submit(evaluate_comprehension, transcript, doc)

    # Exiting the `with` block waits for all three futures.
    grammar_result = grammar_future.result()
    return {
        "intonation_score": round(float(intonation_future.result().get("intonation_score")) * 10, 2),
        "grammar_score": round(float(grammar_result.get("grammar_score")) * 10, 2),
        "grammar_errors": grammar_result.get("errors"),
        "comprehension_score": round(float(comprehension_future.result().get("comprehension_score")) * 10, 2),
    }
|
app/utils/grammar.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import spacy
|
| 2 |
+
import speech_recognition as sr
|
| 3 |
+
from pydub import AudioSegment
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Enhanced Sentence Structure Scoring
|
| 7 |
+
# Enhanced Sentence Structure Scoring
def evaluate_sentence_structure(doc):
    """Score sentence structure (0-10) from length and syntactic variety."""
    sentences = list(doc.sents)
    if not sentences:
        return 0

    total = 0.0
    for sentence in sentences:
        # Length component: moderate-length sentences are ideal.
        n_tokens = len(sentence)
        if 10 <= n_tokens <= 20:
            length_score = 10
        elif 20 < n_tokens <= 30:
            length_score = 8
        elif n_tokens > 30:
            length_score = 5
        else:
            length_score = 2

        # Complexity component: distinct dependency relations, punctuation excluded.
        dep_types = {token.dep_ for token in sentence if token.dep_ != "punct"}
        if len(dep_types) > 6:        # complex sentence
            complexity_score = 10
        elif len(dep_types) >= 3:     # moderately complex
            complexity_score = 7
        else:                         # simple sentence
            complexity_score = 4

        # Per-sentence score is the mean of the two components.
        total += (length_score + complexity_score) / 2

    # Average over all sentences, capped at 10.
    return min(total / len(sentences), 10)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Grammar scoring based on Sentence Structure, Grammar Usage, and Vocabulary Range
|
| 42 |
+
# Grammar scoring based on Sentence Structure, Grammar Usage, and Vocabulary Range
def evaluate_grammar(text, doc):
    """Score grammar (0-10) from sentence structure, usage and vocabulary range.

    Args:
        text: The raw transcript (returned unchanged for the caller's reference).
        doc: spaCy Doc parsed from *text*.

    Returns:
        dict with the three sub-scores, the averaged "grammar_score",
        the detected "errors" and the original "text".
    """
    # 1. Sentence Structure (0-10, averaged across sentences).
    sentence_structure_score = evaluate_sentence_structure(doc)
    errors = []

    # 2. Grammar Usage: start from a perfect score and deduct for simple
    #    heuristic violations — e.g. a nominal subject whose head is not a verb:
    #    - "The boy hungry."    # wrong
    #    - "The boy is hungry"  # correct
    grammar_usage_score = 10  # Assume perfect grammar initially
    for token in doc:
        if token.dep_ == "nsubj" and token.head.pos_ != "VERB":
            errors.append(
                {
                    "word": token.text,
                    "position_in_text": token.i,
                    "error": "Subject without a verb",
                    "suggestion": "Ensure the subject is followed by a verb.",
                }
            )
            grammar_usage_score -= 2  # Deduct points for common grammar mistakes

    # 3. Vocabulary Range: lexical diversity scaled to 0-10. Guard the empty
    #    doc, which previously raised ZeroDivisionError on len(doc) == 0.
    if len(doc) == 0:
        vocabulary_range_score = 0
    else:
        unique_words = {token.text.lower() for token in doc if token.is_alpha}
        vocabulary_range_score = min(len(unique_words) / len(doc), 1) * 10

    # Final Grammar Score: (Sentence Structure + Grammar Usage + Vocabulary Range) / 3
    grammar_score = (sentence_structure_score + grammar_usage_score + vocabulary_range_score) / 3

    return {
        "sentence_structure": round(sentence_structure_score, 2),
        "grammar_usage": round(grammar_usage_score, 2),
        "vocabulary_range": round(vocabulary_range_score, 2),
        "grammar_score": round(grammar_score, 2),
        "errors": errors,
        "text": text,
    }
|
app/utils/intonation.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
from pydub import AudioSegment
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# Feature extraction for Intonation
|
| 7 |
+
# Feature extraction for Intonation
def evaluate_intonation(wav_file):
    """Score intonation (0-10) from energy, pitch variation and tempo.

    Sub-scores (each clamped to [0, 10]):
        - sentence stress: RMS energy (perceived loudness) as a stress proxy.
        - intonation patterns: relative pitch variation (std/mean of
          confident pitch estimates from piptrack).
        - rhythm: estimated tempo relative to an average speaking rate
          of ~120 BPM.
    """
    y, sr = librosa.load(wav_file)

    # 1. Sentence stress — root-mean-square energy, scaled up for scoring.
    stress_raw = float(np.mean(librosa.feature.rms(y=y)[0]) * 10)

    # 2. Intonation patterns — coefficient of variation of the pitch
    #    estimates whose magnitude exceeds the median (the confident ones).
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    confident_pitches = pitches[magnitudes > np.median(magnitudes)]
    if len(confident_pitches) > 0:
        pitch_raw = float(np.std(confident_pitches) / np.mean(confident_pitches) * 10)
    else:
        pitch_raw = 0.0

    # 3. Rhythm — beats-per-minute estimate relative to 120 BPM speech.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    rhythm_raw = float(tempo / 120 * 10)

    # Clamp every sub-score into [0, 10] before averaging.
    sentence_stress_score = min(max(stress_raw, 0), 10)
    intonation_patterns_score = min(max(pitch_raw, 0), 10)
    rhythm_score = min(max(rhythm_raw, 0), 10)

    # Final score: (Sentence Stress + Intonation Patterns + Rhythm) / 3
    intonation_score = (sentence_stress_score + intonation_patterns_score + rhythm_score) / 3

    return {
        "sentence_stress": sentence_stress_score,
        "intonation_patterns": intonation_patterns_score,
        "rhythm": rhythm_score,
        "intonation_score": intonation_score,
    }
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.8"
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
web:
|
| 5 |
+
build: .
|
| 6 |
+
command: "uvicorn app.main:server --host 0.0.0.0 --port 7860"
|
| 7 |
+
volumes:
|
| 8 |
+
- .:/app
|
| 9 |
+
ports:
|
| 10 |
+
- "7860:7860"
|
| 11 |
+
env_file:
|
| 12 |
+
- .env
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pydub
|
| 4 |
+
spacy
|
| 5 |
+
SpeechRecognition==3.10.4
|
| 6 |
+
loguru
|
| 7 |
+
numpy
|
| 8 |
+
librosa
|
| 9 |
+
azure-cognitiveservices-speech
|
| 10 |
+
python-dotenv
|
| 11 |
+
pydantic-settings
|
| 12 |
+
python-multipart
|