| import re |
|
|
| from datasets import load_dataset |
| from deepmultilingualpunctuation import PunctuationModel |
| from multiprocess import set_start_method |
|
|
| from nltk.tokenize import word_tokenize, sent_tokenize |
| from nltk.tag import pos_tag |
|
|
| import nltk |
| import spacy |
|
|
| |
|
|
| |
|
|
# Punctuation-restoration model (deepmultilingualpunctuation, BERT-based).
# Loaded once at import time; reused by every map() call below.
model = PunctuationModel()

# English Multilingual LibriSpeech dataset with tags; num_proc speeds up the
# initial load/preparation only, not the later .map() pass.
ds = load_dataset("ylacombe/mls-eng-tags", split = "train", num_proc=16)
|
|
def truecasing_by_pos(input_text):
    """True-case text using NLTK POS tags.

    Lower-cases every token, then capitalizes tokens tagged as proper
    nouns (NNP/NNPS) and the first word of each sentence.

    Args:
        input_text: raw text to true-case.

    Returns:
        The true-cased text, sentences separated by single spaces.
    """
    truecased_sentences = []

    for sent_text in sent_tokenize(input_text):
        words = word_tokenize(sent_text)

        # Tag the lower-cased tokens so POS tagging is case-independent.
        tagged_words = pos_tag([word.lower() for word in words])

        # Proper nouns get an initial capital; everything else stays lower.
        capitalized_words = [w.capitalize() if t in ["NNP","NNPS"] else w for (w,t) in tagged_words]

        # The sentence-initial word is always capitalized.
        if capitalized_words:
            capitalized_words[0] = capitalized_words[0].capitalize()

        truecased_sentences.append(" ".join(capitalized_words).strip())

    # BUG FIX: the original accumulated with `full_text += sentence.strip()`,
    # gluing consecutive sentences together with no separator
    # ("First.Second."). Join with a single space instead.
    return " ".join(truecased_sentences).strip()
|
|
def true_case(text):
    """True-case *text* with NLTK: capitalize the first word of every
    sentence and every token tagged as a proper noun (NNP/NNPS).

    Returns the re-joined text with the space before punctuation removed.
    """
    out_sentences = []

    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))

        pieces = []
        for idx, (tok, tag) in enumerate(tagged):
            # Sentence start or proper noun -> initial capital.
            if idx == 0 or tag in ('NNP', 'NNPS'):
                pieces.append(tok.capitalize())
            else:
                pieces.append(tok)

        joined = ' '.join(pieces)

        # word_tokenize splits punctuation into its own token; re-attach it
        # ("word ." -> "word.").
        joined = re.sub(r'(\w) (\W)', r'\1\2', joined)

        out_sentences.append(joined)

    return ' '.join(out_sentences)
|
|
# Pin spaCy to a specific GPU.
# NOTE(review): gpu_id=2 is hard-coded — confirm this matches the machine
# this script is deployed on.
spacy.require_gpu(gpu_id=2)

# Transformer English pipeline (used for sentence splitting + POS tags).
nlp = spacy.load('en_core_web_trf')

from spacy.util import compile_infix_regex
|
|
def custom_tokenizer(nlp):
    """Return a Tokenizer whose infix rules additionally match hyphenated
    words (e.g. "state-of-the-art").

    Args:
        nlp: a loaded spaCy pipeline; its vocab and default infixes are used.

    Returns:
        A spacy.tokenizer.Tokenizer with the extended infix pattern.
    """
    # FIX: use a raw string — '\w' in a plain string literal is an invalid
    # escape sequence (DeprecationWarning today, SyntaxError in future
    # Python versions). The regex itself is unchanged.
    infixes = nlp.Defaults.infixes + [r'\w+(?:-\w+)+']
    infix_regex = compile_infix_regex(infixes)
    # NOTE(review): only infix_finditer is supplied, so the pipeline's
    # default prefix/suffix rules and special cases are dropped — leading/
    # trailing punctuation will stay attached to tokens. Confirm intended.
    return spacy.tokenizer.Tokenizer(nlp.vocab, infix_finditer=infix_regex.finditer)
|
|
| |
# Swap in the hyphen-aware tokenizer before any text is processed.
nlp.tokenizer = custom_tokenizer(nlp)
|
|
def true_case_spacy(text):
    """True-case *text* with the module-level spaCy pipeline `nlp`:
    capitalize each sentence's first token and every PROPN token.

    Returns the re-joined text with spaces before punctuation removed.
    """
    doc = nlp(text)

    restored = []
    for sent in doc.sents:
        # Sentence start or proper noun -> initial capital, else unchanged.
        words = [
            tok.text.capitalize() if (pos == 0 or tok.pos_ == 'PROPN') else tok.text
            for pos, tok in enumerate(sent)
        ]

        sentence = ' '.join(words)

        # Undo the space the join inserts before punctuation tokens
        # ("word ." -> "word.").
        sentence = re.sub(r'(\w) (\W)', r'\1\2', sentence)

        restored.append(sentence)

    return ' '.join(restored)
|
|
|
|
def repunctuation_apply_simple(batch):
    """Dataset.map callback: restore punctuation in batch["text"], then
    true-case it and store the result under "repunct_text"."""
    restored = model.restore_punctuation(batch["text"])
    batch["repunct_text"] = true_case_spacy(restored)
    return batch
|
|
if __name__ == "__main__":
    # "spawn" start method is required for CUDA-using code in worker
    # processes (fork would inherit an initialized CUDA context).
    set_start_method("spawn")
    # NOTE(review): batch_size has no effect here because batched=True is
    # not passed — map() hands the function one example at a time; confirm.
    repunct_ds = ds.map(repunctuation_apply_simple, batch_size=1, num_proc=14)
    repunct_ds.push_to_hub("reach-vb/mls-eng-tags-spacy-v2", split = "train")
|
|