"""Helpers for training a byte-level BPE tokenizer from a JSONL corpus."""
from __future__ import annotations

import os
import random
from collections.abc import Iterator
from pathlib import Path

from filelock import FileLock
import simdjson as json
from tqdm import tqdm
from tokenizers.models import BPE
from tokenizers import Tokenizer, pre_tokenizers, Regex
from tokenizers.pre_tokenizers import ByteLevel, Split, Digits
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.trainers import BpeTrainer

# Number of texts handed to the trainer per batch.
_TRAIN_BATCH_SIZE = 1024

# Pretokenize digits in groups of 3 from right to left (from Luca).
# Raw strings: "\d", "\p", "\s" are regex escapes, not Python string escapes.
_DIGIT_GROUP_PATTERN = r"(?=(\d{3})+(?!\d))"

# GPT-2 pretokenization alternatives, appended to the digit pattern.
_GPT2_PATTERN = r"| ?\p{L}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"


def ensure_dir(d):
    """Create directory ``d`` (including parents) if nothing exists at that path."""
    if not os.path.exists(d):
        # exist_ok covers the case where the directory appears between the
        # existence check and the creation call.
        os.makedirs(d, exist_ok=True)


def read_json(file):
    """Parse and return the JSON document stored at ``file``.

    The file is opened as UTF-8 text and closed before returning.
    """
    with open(file, encoding="utf-8") as f:
        return json.load(f)


def jsonl_batch_generator(jsonl_file: str, batch_size: int) -> Iterator[list[str]]:
    """Yield batches of texts read from a JSONL file.

    Each non-blank line must be a JSON object with a ``"text"`` field.
    Blank (whitespace-only) lines are skipped.

    Args:
        jsonl_file: Path to the JSONL file, read as UTF-8.
        batch_size: Number of texts per yielded batch; the final batch may
            be smaller.

    Yields:
        Lists of the ``"text"`` values, in file order.

    Raises:
        KeyError: If a parsed line has no ``"text"`` field.
    """
    batch = []
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # An empty line is not valid JSON; skip it instead of
                # aborting the whole stream.
                continue
            data = json.loads(line)
            batch.append(data["text"])
            if len(batch) >= batch_size:
                yield batch
                batch = []
    if batch:
        yield batch


def train_or_extend_tokenizer(
    text_files: str,
    vocab_size: int = 100000,
    do_whitespace_pretokenization: bool = True,
) -> Tokenizer:
    """Train a byte-level BPE tokenizer on the texts of a JSONL file.

    Despite the name, this always builds a new ``Tokenizer(BPE())`` and
    trains it from scratch; no existing tokenizer is extended.

    The pre-tokenizer is a sequence of ``Digits`` (digit runs kept together),
    a regex ``Split`` in ``"isolated"`` mode, and ``ByteLevel`` with its
    built-in regex disabled.

    Args:
        text_files: Path to a single JSONL file (despite the plural name);
            each line must be a JSON object with a ``"text"`` field.
        vocab_size: Target vocabulary size passed to ``BpeTrainer``.
        do_whitespace_pretokenization: When true, the GPT-2 style
            alternatives are appended to the digit-grouping split pattern;
            when false, only the digit-grouping pattern is used.

    Returns:
        The trained ``Tokenizer``.
    """
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(show_progress=True, vocab_size=vocab_size)

    regex_string = _DIGIT_GROUP_PATTERN
    if do_whitespace_pretokenization:
        regex_string += _GPT2_PATTERN

    pretokenizers = [
        Digits(individual_digits=False),
        Split(
            pattern=Regex(regex_string),
            behavior="isolated",
            invert=False,
        ),
        ByteLevel(
            add_prefix_space=False,
            trim_offsets=True,
            use_regex=False,
        ),
    ]
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(pretokenizers)
    tokenizer.decoder = ByteLevelDecoder(
        add_prefix_space=False, trim_offsets=True, use_regex=False
    )

    # Stream the corpus in batches so the whole file is never held in memory.
    generator = jsonl_batch_generator(text_files, _TRAIN_BATCH_SIZE)
    tokenizer.train_from_iterator(generator, trainer=trainer)
    return tokenizer