readme, makefile, service files
Browse files- Dockerfile +19 -0
- Makefile +134 -0
- README.md +0 -1
- main.py +62 -0
- pages/nav.py +32 -0
- pytest.ini +3 -0
- requirements.txt +22 -0
- src/datasets/preprocess.py +182 -0
- src/logger.py +47 -0
- src/schemas/requests.py +9 -0
- src/schemas/responses.py +22 -0
- task_description.md +80 -0
Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

WORKDIR /app

# Compiler toolchain for Python packages that build native extensions.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies first so this layer is cached across source changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Create the unprivileged runtime user and hand /app over to it BEFORE copying
# the sources: files copied as root would be read-only for `user`, which
# prevents the service from writing anything under its working directory.
RUN useradd -m -u 1000 user && chown user:user /app
COPY --chown=user:user . .

USER user

# Only the Streamlit UI port is published; the API on 8000 is container-internal.
EXPOSE 7860

CMD ["sh", "-c", "uvicorn main:app --host 0.0.0.0 --port 8000 & streamlit run app.py --server.port 7860 --server.address 0.0.0.0 --server.headless true"]
|
Makefile
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# None of the targets below produce a file named after themselves; declaring
# them phony keeps them runnable even if a file or directory with the same
# name (e.g. `clean`, `install`) ever appears in the repository root.
.PHONY: install install-runpod \
	download_tedseg download_pubmed download_wikipedia \
	preprocess_all preprocess_pubmed preprocess_wikipedia preprocess_gutenberg preprocess_ted \
	create_recipes \
	sentence_split_all sentence_split_gutenberg sentence_split_ted sentence_split_pubmed \
	sentence_split_wikipedia sentence_split_recipes \
	build_recipes_pairs \
	train_distilbert train_bert train_deberta train_all \
	export_distilbert export_bert export_deberta export_all \
	upload_distilbert upload_bert upload_deberta upload_all \
	download_distilbert download_bert download_deberta download_all \
	inference_distilbert inference_bert inference_deberta \
	inference_local_distilbert inference_local_bert inference_local_deberta \
	test-api test-dataset test-pipelines test-all \
	clean run-be run-fe

# --- Environment -------------------------------------------------------------

install:
	pip install -r requirements.txt

install-runpod:
	pip install --ignore-installed blinker && pip install -r requirements.txt

# --- Raw data download -------------------------------------------------------

download_tedseg:
	python -m src.datasets.download_tedseg

download_pubmed:
	python -m src.datasets.download_pubmed --k 100

download_wikipedia:
	python -m src.datasets.download_wikipedia --k 100

# --- Preprocessing -----------------------------------------------------------

preprocess_all: preprocess_pubmed preprocess_wikipedia preprocess_gutenberg preprocess_ted

preprocess_pubmed:
	python -m src.datasets.preprocess pubmed --input data/pubmed/raw/pubmed_raw.jsonl --output data/pubmed/pubmed_data.jsonl

preprocess_wikipedia:
	python -m src.datasets.preprocess wikipedia --input data/wikipedia/raw/wikipedia_raw.jsonl --output data/wikipedia/wikipedia_data.jsonl

preprocess_gutenberg:
	python -m src.datasets.preprocess gutenberg --raw_dir data/gutenberg/raw --output data/gutenberg/gutenberg_data.jsonl

preprocess_ted:
	python -m src.datasets.preprocess ted --input_dir data/tedseg/raw --output_dir data/tedseg

create_recipes:
	python -m src.datasets.create_recipes_dataset

# --- Sentence splitting ------------------------------------------------------

sentence_split_all: sentence_split_gutenberg sentence_split_pubmed sentence_split_wikipedia sentence_split_recipes

sentence_split_gutenberg:
	python -m src.datasets.sentence_splitter gutenberg --input data/gutenberg/gutenberg_data.jsonl --output data/gutenberg/gutenberg_sentences.jsonl --device cuda

sentence_split_ted:
	python -m src.datasets.sentence_splitter ted --input_dir data/tedseg/raw --output_dir data/tedseg --device cuda

sentence_split_pubmed:
	python -m src.datasets.sentence_splitter pubmed --input data/pubmed/pubmed_data.jsonl --output data/pubmed/pubmed_sentences.jsonl --device cuda

sentence_split_wikipedia:
	python -m src.datasets.sentence_splitter wikipedia --input data/wikipedia/wikipedia_data.jsonl --output data/wikipedia/wikipedia_sentences.jsonl --device cuda

sentence_split_recipes:
	python -m src.datasets.sentence_splitter recipes --input data/recipes/recipes_data.jsonl --output data/recipes/recipes_sentences.jsonl --device cuda

build_recipes_pairs:
	python -m src.datasets.build_pairs recipes

# --- Training ----------------------------------------------------------------

train_distilbert:
	python -m src.models.train --model distilbert --out checkpoints/distilbert

train_bert:
	python -m src.models.train --model bert --out checkpoints/bert

train_deberta:
	python -m src.models.train --model deberta --out checkpoints/deberta

train_all: train_distilbert train_bert train_deberta

# --- Export / upload / download ----------------------------------------------

export_distilbert:
	python -m src.models.export_and_download --checkpoint checkpoints/distilbert/best

export_bert:
	python -m src.models.export_and_download --checkpoint checkpoints/bert/best

export_deberta:
	python -m src.models.export_and_download --checkpoint checkpoints/deberta/best

export_all: export_distilbert export_bert export_deberta

upload_distilbert:
	python -m src.models.export_and_download --checkpoint checkpoints/distilbert/best --repo slamos/bc-models-distilbert

upload_bert:
	python -m src.models.export_and_download --checkpoint checkpoints/bert/best --repo slamos/bc-models-bert

upload_deberta:
	python -m src.models.export_and_download --checkpoint checkpoints/deberta/best --repo slamos/bc-models-deberta

upload_all: upload_distilbert upload_bert upload_deberta

download_distilbert:
	python -m src.models.export_and_download --download distilbert

download_bert:
	python -m src.models.export_and_download --download bert

download_deberta:
	python -m src.models.export_and_download --download deberta

download_all: download_distilbert download_bert download_deberta

# --- Inference ---------------------------------------------------------------

inference_distilbert:
	python -m src.models.inference --model distilbert

inference_bert:
	python -m src.models.inference --model bert

inference_deberta:
	python -m src.models.inference --model deberta

inference_local_distilbert:
	python -m src.models.inference --model distilbert --local

inference_local_bert:
	python -m src.models.inference --model bert --local

inference_local_deberta:
	python -m src.models.inference --model deberta --local

# --- Tests -------------------------------------------------------------------

test-api:
	python -m pytest tests/test_api.py -v

test-dataset:
	python -m pytest tests/test_dataset.py -v

test-pipelines:
	python -m pytest tests/test_pipelines.py -v

test-all:
	python -m pytest tests/ -v

# --- Housekeeping / local run ------------------------------------------------

clean:
	rm -rf checkpoints/distilbert checkpoints/bert checkpoints/deberta checkpoints/plots

run-be:
	uvicorn main:app --host 0.0.0.0 --port 8000

run-fe:
	streamlit run app.py
|
README.md
CHANGED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# bottlecap-ml-dev-test
|
|
|
|
|
|
main.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
from fastapi import FastAPI
|
| 7 |
+
|
| 8 |
+
from src.api import fix_newlines, fix_newlines_all_models, health
|
| 9 |
+
from src.models.export_and_download import download_all_models
|
| 10 |
+
from src.pipelines.all_models_pipeline import AllModelsPipeline
|
| 11 |
+
from src.pipelines.one_model_pipeline import OneModelPipeline
|
| 12 |
+
from src.pipelines.sat_loader import load_sat
|
| 13 |
+
|
| 14 |
+
load_dotenv()
|
| 15 |
+
log = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Prepare the inference pipelines before the application starts serving.

    Loads the SAT model through ``load_sat`` (on CUDA when available, CPU
    otherwise), fetches the model directories through ``download_all_models``
    and stores two objects on ``app.state``:

    * ``one_model_pipeline`` - an ``OneModelPipeline`` built from the "bert"
      directory, or ``None`` when that directory or its ``model.onnx`` is
      missing;
    * ``all_models_pipeline`` - an ``AllModelsPipeline`` holding every
      downloaded model whose directory contains ``model.onnx``.

    Nothing runs after the ``yield``; no shutdown work is performed here.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    log.info(f"Loading SAT-3L-SM on {device} ...")
    sat = load_sat(device=device)

    downloaded = download_all_models()

    # Single-model pipeline: served by bert only.
    bert_dir = downloaded.get("bert")
    bert_ready = bool(bert_dir) and (bert_dir / "model.onnx").exists()
    if not bert_ready:
        log.warning("bert not available — OneModelPipeline disabled")
        app.state.one_model_pipeline = None
    else:
        log.info("Loading OneModelPipeline (bert)")
        app.state.one_model_pipeline = OneModelPipeline(
            onnx_path=bert_dir / "model.onnx",
            tokenizer_path=bert_dir,
            sat_model=sat,
            model_name="bert",
        )

    # Multi-model pipeline: register every model that has an ONNX export.
    all_pipeline = AllModelsPipeline(sat_model=sat)
    for name, local_dir in downloaded.items():
        onnx_path = local_dir / "model.onnx"
        if not onnx_path.exists():
            log.warning(f"Skipping {name}: model.onnx not found in {local_dir}")
            continue
        log.info(f"Adding {name} to AllModelsPipeline")
        all_pipeline.add_model(name, onnx_path, local_dir)
    app.state.all_models_pipeline = all_pipeline

    yield
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ASGI application object; `lifespan` builds the pipelines at startup.
app = FastAPI(
    lifespan=lifespan,
    title="Newline Fixer API",
    description="ML service for fixing newline placement in English text",
)

# Router registration order: health, single-model endpoint, all-models endpoint.
app.include_router(router=health.router)
app.include_router(router=fix_newlines.router)
app.include_router(router=fix_newlines_all_models.router)
|
pages/nav.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared navigation stepper for all pages."""
|
| 2 |
+
import streamlit as st
|
| 3 |
+
|
| 4 |
+
PAGES = [
|
| 5 |
+
("Home", "pages/home.py"),
|
| 6 |
+
("Config", "pages/config.py"),
|
| 7 |
+
("Result", "pages/result.py"),
|
| 8 |
+
]
|
| 9 |
+
|
| 10 |
+
AVAILABLE_MODELS = ["rule_based", "distilbert", "bert", "deberta"]
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def show_stepper(current: str):
    """Draw one column per entry of ``PAGES`` as a numbered step indicator.

    The step whose title equals ``current`` gets the highlighted (red, bold)
    underline; every other step is rendered greyed out. An empty
    ``st.write`` follows the row of steps.
    """
    base_style = "text-align:center; padding:8px 0; "
    active_style = "border-bottom:3px solid #ff4b4b; font-weight:700;"
    idle_style = "border-bottom:3px solid #444; color:#888;"

    columns = st.columns(len(PAGES))
    numbered = enumerate(zip(columns, PAGES), start=1)
    for step, (column, (title, _path)) in numbered:
        accent = active_style if title == current else idle_style
        with column:
            st.markdown(
                f"<div style='{base_style}{accent}'>{step}. {title}</div>",
                unsafe_allow_html=True,
            )
    st.write("")
|
pytest.ini
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[pytest]
|
| 2 |
+
filterwarnings =
|
| 3 |
+
ignore:split_on_input_newlines=False:UserWarning
|
requirements.txt
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.121.0
|
| 2 |
+
uvicorn>=0.34.0
|
| 3 |
+
pydantic>=2.12.4
|
| 4 |
+
torch>=2.6.0
|
| 5 |
+
torchvision>=0.21.0
|
| 6 |
+
transformers>=4.50.3
|
| 7 |
+
datasets>=2.21.0
|
| 8 |
+
tqdm>=4.67.3
|
| 9 |
+
pandas>=1.4.0
|
| 10 |
+
requests>=2.32.5
|
| 11 |
+
python-dotenv>=1.2.1
|
| 12 |
+
wtpsplit>=2.2.0
|
| 13 |
+
sentencepiece>=0.2.0
|
| 14 |
+
onnx>=1.16.0
|
| 15 |
+
onnxruntime>=1.18.0
|
| 16 |
+
onnxscript>=0.1.0
|
| 17 |
+
huggingface_hub>=0.23.0
|
| 18 |
+
matplotlib
|
| 19 |
+
accelerate>=0.26.0
|
| 20 |
+
wandb>=0.17.0
|
| 21 |
+
streamlit>=1.35.0
|
| 22 |
+
pytest>=8.0.0
|
src/datasets/preprocess.py
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def _split_gutenberg_sections(text: str) -> list[str]:
    """Return the headings and bodies of one raw book, in document order."""
    paragraphs: list[str] = []
    for section in re.split(r"\n*==========\n*", text):
        section = section.strip()
        if not section:
            continue
        # The first line of a section is its heading, the remainder its body.
        # A section without any newline is a heading with no body.
        heading, _, body = section.partition("\n")
        heading = heading.strip()
        body = body.strip()
        if heading:
            paragraphs.append(heading)
        if body:
            paragraphs.append(body)
    return paragraphs


def preprocess_gutenberg(raw_dir: str, output_path: str):
    """
    Convert every file in raw_dir (Gutenberg books) into one JSONL line:

        {"file_name": "3046.ref", "text": ["heading 1", "body 1", ...]}

    Each file is cut into sections at lines consisting of ``==========``.
    Within a section the first line is emitted as one entry (the heading)
    and all remaining text as a second entry (the body); the body is NOT
    split any further. Empty sections are dropped, sub-directories of
    raw_dir are ignored, and files that yield no entries are not written.

    Args:
        raw_dir: Directory containing the raw UTF-8 text files.
        output_path: Destination JSONL file; parent directories are created.
    """
    raw = Path(raw_dir)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_books = 0
    with open(out, "w", encoding="utf-8") as fout:
        # Sorted so the output order is deterministic across file systems.
        for filepath in sorted(raw.iterdir()):
            if filepath.is_dir():
                continue
            paragraphs = _split_gutenberg_sections(
                filepath.read_text(encoding="utf-8")
            )
            if not paragraphs:
                continue

            record = {"file_name": filepath.name, "text": paragraphs}
            fout.write(json.dumps(record, ensure_ascii=False) + "\n")
            n_books += 1

    print(f"Wrote {n_books:,} books to {out}")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def preprocess_ted(input_dir: str, output_dir: str, max_paragraphs: int = 60):
    """
    Filter every ``*.jsonl`` file found in input_dir by paragraph count.

    Each non-empty line is parsed as a JSON object; objects whose "text"
    list has more than max_paragraphs entries are dropped, all others are
    written unchanged to ``{name}_preprocessed.jsonl`` in output_dir.
    A summary line is printed per input file.

    When input_dir and output_dir are the same directory, files that already
    end in ``_preprocessed`` are skipped so that a re-run does not consume
    its own previous output.

    Args:
        input_dir: Directory containing the split files.
        output_dir: Directory receiving the filtered files (created if missing).
        max_paragraphs: Largest paragraph count that is still kept.
    """
    inp = Path(input_dir)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)

    suffix = "_preprocessed"
    same_dir = inp.resolve() == out.resolve()

    for split_file in sorted(inp.glob("*.jsonl")):
        if same_dir and split_file.stem.endswith(suffix):
            # Output of an earlier run living next to the inputs.
            continue
        out_file = out / f"{split_file.stem}{suffix}.jsonl"
        n_kept = 0
        n_dropped = 0
        with open(split_file, encoding="utf-8") as fin, open(out_file, "w", encoding="utf-8") as fout:
            for line in fin:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                paragraphs = obj.get("text", [])
                if len(paragraphs) > max_paragraphs:
                    n_dropped += 1
                    continue
                fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
                n_kept += 1
        print(f"{split_file.name}: kept {n_kept:,}, dropped {n_dropped:,} with >{max_paragraphs} paragraphs → {out_file}")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def preprocess_jsonl_paragraphs(input_path: str, output_path: str, text_field: str = "text"):
    """
    Read a raw JSONL file, split the text field on double newlines into
    a list of paragraphs, and write the result as a new JSONL file.

    Every other key of a record is passed through untouched. Paragraphs are
    stripped and empty ones discarded. A record whose text field is missing
    or ``null`` gets an empty paragraph list instead of aborting the run.

    Output format per line:
        {"document_idx": 0, "text": ["paragraph 1", "paragraph 2", ...], ...}

    Args:
        input_path: Source JSONL file (blank lines are ignored).
        output_path: Destination JSONL file; parent directories are created.
        text_field: Name of the key holding the raw text.
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # `or ""` also covers an explicit JSON null, which .get() alone
            # would hand back as None.
            raw_text = obj.get(text_field) or ""
            obj[text_field] = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _strip_yaml_frontmatter(text: str) -> str:
    """Remove YAML frontmatter (--- ... ---) from the beginning of text.

    Frontmatter is only recognised when the very first line is exactly
    ``---`` (trailing whitespace allowed) and a later line is exactly
    ``---`` as well. Text that merely starts with dashes, such as
    ``--- Abstract ---``, or whose block is never closed is returned
    unchanged, so no body content is cut away by accident.
    """
    lines = text.split("\n")
    if lines[0].rstrip() != "---":
        return text
    for idx in range(1, len(lines)):
        if lines[idx].rstrip() == "---":
            # Drop the block and the blank lines that follow it.
            return "\n".join(lines[idx + 1:]).lstrip("\n")
    return text


def preprocess_pubmed(input_path: str, output_path: str):
    """
    Split the "text" of every record of a raw PubMed JSONL file into paragraphs.

    Leading YAML frontmatter is removed first; the remaining text is split on
    double newlines, each paragraph stripped and empty ones dropped. All other
    keys are passed through. A missing or ``null`` text yields an empty list.

    Args:
        input_path: Source JSONL file (blank lines are ignored).
        output_path: Destination JSONL file; parent directories are created.
    """
    inp = Path(input_path)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    n_docs = 0
    with open(inp, encoding="utf-8") as fin, open(out, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            # `or ""` guards against an explicit JSON null.
            raw_text = _strip_yaml_frontmatter(obj.get("text") or "")
            obj["text"] = [p.strip() for p in raw_text.split("\n\n") if p.strip()]
            fout.write(json.dumps(obj, ensure_ascii=False) + "\n")
            n_docs += 1

    print(f"Wrote {n_docs:,} documents to {out}")
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def preprocess_wikipedia(input_path: str, output_path: str):
    """Paragraph-split a raw Wikipedia JSONL file on its "text" field.

    Delegates entirely to ``preprocess_jsonl_paragraphs``.
    """
    preprocess_jsonl_paragraphs(
        input_path=input_path,
        output_path=output_path,
        text_field="text",
    )
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
if __name__ == "__main__":
    # Command-line entry point: one sub-command per dataset.
    cli = argparse.ArgumentParser(description="Preprocess raw data into JSONL.")
    commands = cli.add_subparsers(dest="method", required=True)

    gutenberg_cmd = commands.add_parser("gutenberg", help="Convert raw Gutenberg books to JSONL")
    gutenberg_cmd.add_argument("--raw_dir", required=True, help="Directory with raw Gutenberg text files")
    gutenberg_cmd.add_argument("--output", required=True, help="Output JSONL path")

    ted_cmd = commands.add_parser("ted", help="Filter TED talks JSONL by paragraph count")
    ted_cmd.add_argument("--input_dir", required=True, help="Directory with TED split JSONL files")
    ted_cmd.add_argument("--output_dir", required=True, help="Output directory for preprocessed files")
    ted_cmd.add_argument(
        "--max_paragraphs",
        type=int,
        default=60,
        help="Drop talks with more than this many paragraphs (default: 60)",
    )

    # pubmed and wikipedia take the same --input/--output pair.
    for dataset, summary in (
        ("pubmed", "Split PubMed text into paragraphs"),
        ("wikipedia", "Split Wikipedia text into paragraphs"),
    ):
        jsonl_cmd = commands.add_parser(dataset, help=summary)
        jsonl_cmd.add_argument("--input", required=True, help="Input raw JSONL path")
        jsonl_cmd.add_argument("--output", required=True, help="Output JSONL path")

    options = cli.parse_args()

    # Lambdas defer attribute access: each sub-command defines different options.
    handlers = {
        "gutenberg": lambda: preprocess_gutenberg(options.raw_dir, options.output),
        "ted": lambda: preprocess_ted(options.input_dir, options.output_dir, options.max_paragraphs),
        "pubmed": lambda: preprocess_pubmed(options.input, options.output),
        "wikipedia": lambda: preprocess_wikipedia(options.input, options.output),
    }
    run = handlers.get(options.method)
    if run is not None:
        run()
|
src/logger.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Betterstack logging for API endpoint calls."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from datetime import datetime, timezone
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
BETTERSTACK_URL = "https://in.logs.betterstack.com"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def log_to_betterstack(
|
| 14 |
+
endpoint_name: str,
|
| 15 |
+
original_text: str,
|
| 16 |
+
formatted_text: str | dict,
|
| 17 |
+
model_name: str,
|
| 18 |
+
time_elapsed: float,
|
| 19 |
+
) -> None:
|
| 20 |
+
token = os.environ.get("BETTERSTACK_SOURCE_TOKEN")
|
| 21 |
+
if not token:
|
| 22 |
+
return
|
| 23 |
+
|
| 24 |
+
payload = {
|
| 25 |
+
"dt": datetime.now(timezone.utc).isoformat(),
|
| 26 |
+
"message": f"Endpoint called: {endpoint_name}",
|
| 27 |
+
"endpoint_name": endpoint_name,
|
| 28 |
+
"model_name": model_name,
|
| 29 |
+
"original_text": original_text,
|
| 30 |
+
"formatted_text": formatted_text,
|
| 31 |
+
"time_elapsed": round(time_elapsed, 4),
|
| 32 |
+
"request_date": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
|
| 33 |
+
"level": "info"
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
try:
|
| 37 |
+
requests.post(
|
| 38 |
+
BETTERSTACK_URL,
|
| 39 |
+
json=payload,
|
| 40 |
+
headers={
|
| 41 |
+
"Authorization": f"Bearer {token}",
|
| 42 |
+
"Content-Type": "application/json",
|
| 43 |
+
},
|
| 44 |
+
timeout=5,
|
| 45 |
+
)
|
| 46 |
+
except Exception:
|
| 47 |
+
pass
|
src/schemas/requests.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class FixNewlinesRequest(BaseModel):
    # Request payload with a single required field.
    # NOTE(review): presumably the body of the single-model endpoint - confirm in src/api.
    text: str = Field(description="Input text with broken newline placement")
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class FixNewlinesAllModelsRequest(BaseModel):
    # Same shape as FixNewlinesRequest, kept as a separate schema type.
    # NOTE(review): presumably the body of the all-models endpoint - confirm in src/api.
    text: str = Field(description="Input text with broken newline placement")
|
src/schemas/responses.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class FixNewlinesResponse(BaseModel):
    # Result of one model: the corrected text plus the model's name.
    fixed_text: str = Field(description="Text with corrected newline placement")
    model_used: str = Field(description="Name of the model that produced the result")
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ModelResult(BaseModel):
    # One entry of FixNewlinesAllModelsResponse.results.
    model_name: str  # name of the model
    fixed_text: str  # text as corrected by that model
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class FixNewlinesAllModelsResponse(BaseModel):
    # Collection of per-model results; the field is required.
    results: list[ModelResult] = Field(description="Results from all available models")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class HealthResponse(BaseModel):
    # Health payload: a status string and the names of the available models.
    status: str
    available_models: list[str]
|
task_description.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Applied ML Engineer Challenge
|
| 2 |
+
|
| 3 |
+

|
| 4 |
+
|
| 5 |
+
Hey there! Are you interested in LLMs? Do you like building real products, experimenting with neural networks, implementing different ideas and testing them out? Would you like to do that for a living? Then you're in the right place!
|
| 6 |
+
This is an official test for people interested in joining [BottleCapAI](https://www.bottlecapai.com).
|
| 7 |
+
|
| 8 |
+
---
|
| 9 |
+
|
| 10 |
+
## Objective
|
| 11 |
+
|
| 12 |
+
Design and implement a Machine Learning service capable of fixing newline placement in English natural language text. Develop a model,
|
| 13 |
+
and a service with an HTTP API. You can choose any architecture and method of obtaining the model, and any Python framework for the HTTP API.
|
| 14 |
+
It is not necessary to aim for state-of-the-art performance, but the service should efficiently generate reasonable answers.
|
| 15 |
+
|
| 16 |
+
Examples:
|
| 17 |
+
```
|
| 18 |
+
3.2.3 Applications of Attention
|
| 19 |
+
in our Model The Transformer uses multi-head attention in three different ways: • In "encoder-decoder attention" layers,
|
| 20 |
+
the que
|
| 21 |
+
ries come from the previous decoder layer.
|
| 22 |
+
• In "encoder-only attention" layers,
|
| 23 |
+
the que
|
| 24 |
+
ries come from the previous decoder layer.[...]
|
| 25 |
+
```
|
| 26 |
+
->
|
| 27 |
+
```
|
| 28 |
+
3.2.3 Applications of Attention in our Model
|
| 29 |
+
|
| 30 |
+
The Transformer uses multi-head attention in three different ways:
|
| 31 |
+
• In "encoder-decoder attention" layers, the queries come from the previous decoder layer.
|
| 32 |
+
[...]
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
For your solution, it is required to:
|
| 36 |
+
- Implement an API with at least one endpoint that accepts a text input and returns a text with fixed newlines.
|
| 37 |
+
- Provide a Dockerfile with the environment to run the service.
|
| 38 |
+
- Write tests for the service.
|
| 39 |
+
- Compute appropriate metrics for evaluating the performance of the model.
|
| 40 |
+
- Provide a `report.md` with instructions on how to run the service, explaining your approach, decisions taken, and reporting the results.
|
| 41 |
+
- [optional] Ideally, we would encourage you to provide us with a link to the service deployed, for instance on [Huggingface Spaces](https://huggingface.co/spaces), with a minimal UI
|
| 42 |
+
for testing the model (can be in [Streamlit](https://github.com/streamlit/streamlit) or basic HTML + js). Provide the space link in your report.
|
| 43 |
+
|
| 44 |
+
In case you need access to GPUs for developing your model, we recommend using free online solutions, such as [Google Colab](https://colab.research.google.com/),
|
| 45 |
+
[Modal notebooks](https://modal.com/products/notebooks), or [Kaggle kernels](https://www.kaggle.com/kernels).
|
| 46 |
+
It is fine to focus on smaller models in case of any hardware-related difficulties, as we are not expecting SOTA performance.
|
| 47 |
+
|
| 48 |
+
---
|
| 49 |
+
|
| 50 |
+
## What's the point?
|
| 51 |
+
|
| 52 |
+
We are interested in your drive, interest in ML and ability to create useful working products with it, adapting models to certain tasks,
|
| 53 |
+
skill in developing, evaluating, monitoring, and deploying production-ready ML services, and general programming skills.
|
| 54 |
+
|
| 55 |
+
---
|
| 56 |
+
|
| 57 |
+
## Submission
|
| 58 |
+
|
| 59 |
+
To submit your results, run:
|
| 60 |
+
```bash
|
| 61 |
+
git bundle create <first name>-<last name>.bundle --all
|
| 62 |
+
```
|
| 63 |
+
Then send your .bundle file to hey(at)bottlecapai.com with the subject in the format: \<first name\>-\<last name\>-applied-ml-test.
|
| 64 |
+
|
| 65 |
+
At this moment, we are interested mainly in candidates willing to relocate to Prague and authorized to work in the EU. (If you are an exceptional fit, we are happy to discuss possible support options).
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
## 📌 About BottleCapAI
|
| 69 |
+
|
| 70 |
+
At **BottleCapAI**, we’re making large language models **radically more efficient** — aiming for **100× improvements** over today’s approaches. 🚀
|
| 71 |
+
|
| 72 |
+
### 👥 Founders
|
| 73 |
+
- Tomas Mikolov – creator of *word2vec*, pioneer of neural language models.
|
| 74 |
+
- Jaroslav Beck – co-founder of *Beat Games* (*Beat Saber*, 10M+ copies sold, acquired by Meta).
|
| 75 |
+
- David Herel – creator of Thinking Tokens, co-founder of an AI trading startup, and Amazon Alexa Prize finalist.
|
| 76 |
+
|
| 77 |
+
### 🌍 Our vision
|
| 78 |
+
Training frontier LLMs costs **tens of millions** today. Our new algorithms already cut that by **~50%** — and we’re just getting started. We’re building a European hub to push AI forward through **algorithms, not brute force**.
|
| 79 |
+
|
| 80 |
+
📧 **hey(at)bottlecapai.com** · 🌐 [bottlecapai.com](https://www.bottlecapai.com)
|