import kenlm
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Number of lines to sample from the corpus before writing the output file.
TOTAL_SENTENCES = 20000


def pp(log_score: float, length: int) -> float:
    """Return the perplexity for a base-10 log score spread over *length* tokens.

    Computed as ``10 ** (-log_score / length)``.

    Args:
        log_score: Total log10 probability of the sequence.
        length: Number of tokens the score is normalised by; must be non-zero.

    Returns:
        The perplexity as a float.

    Raises:
        ZeroDivisionError: If *length* is 0.
    """
    return 10.0 ** (-log_score / length)

def _iter_lines(dataset, limit):
    """Yield at most *limit* lines taken from the documents of *dataset*.

    Every item of *dataset* must be a mapping with a ``"text"`` entry. The
    text is split on newlines and each piece (empty ones included) is yielded
    in order. Iteration stops as soon as *limit* lines have been produced, so
    the rest of the stream is never consumed.
    """
    if limit <= 0:
        return
    emitted = 0
    for sample in dataset:
        for line in sample["text"].split("\n"):
            yield line
            emitted += 1
            if emitted == limit:
                return


embedder = "distiluse-base-multilingual-cased-v1"
embedder_model = SentenceTransformer(embedder)
# Probe the model with a dummy input to learn how many dimensions it emits.
embedding_shape = embedder_model.encode(["foo"])[0].shape[0]

model = kenlm.Model("es.arpa.bin")
mc4 = load_dataset("mc4", "es", streaming=True)

sentences = []
lengths = []
perplexities = []

shuffled_train = mc4["train"].shuffle(buffer_size=100_000)
# The progress bar tracks sampled lines (the unit we stop on), not documents.
for line in tqdm(_iter_lines(shuffled_train, TOTAL_SENTENCES), total=TOTAL_SENTENCES):
    # Whitespace-separated words plus one; presumably the extra token accounts
    # for the end-of-sentence marker included in the KenLM score.
    length = len(line.split()) + 1
    sentences.append(line)
    lengths.append(length)
    perplexities.append(pp(model.score(line), length))

# Encode all lines in a single call so the model can batch them instead of
# running one forward pass per line. The cast to float64 and the reshape keep
# the array two-dimensional (even when empty) and match the values the TSV
# contained when embeddings were collected as Python floats.
embeddings = np.asarray(
    embedder_model.encode(sentences, show_progress_bar=True),
    dtype=np.float64,
).reshape(len(sentences), embedding_shape)

# Build the embedding columns in one go; adding them one at a time fragments
# the DataFrame and is much slower for hundreds of dimensions.
df = pd.concat(
    [
        pd.DataFrame(
            {"sentence": sentences, "length": lengths, "perplexity": perplexities}
        ),
        pd.DataFrame(
            embeddings,
            columns=[f"dim_{dim}" for dim in range(embedding_shape)],
        ),
    ],
    axis=1,
)
# Written even if the stream ended before TOTAL_SENTENCES lines were seen, so
# a short corpus still produces output instead of silently saving nothing.
df.to_csv("mc4-es-perplexity-sentences.tsv", index=False, sep="\t")
print("DONE!")