train_bpe_tokenizer.py

import argparse
import os

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

from common.tools import load_data

parser = argparse.ArgumentParser()
parser.add_argument("DATASET_PATH", help="path to your input CSV", type=str)
args = parser.parse_args()

DATASET_PATH = args.DATASET_PATH
MODEL_PATH = "../models/bpe_tokenizer.json"  # JSON file the trained tokenizer is saved to
CORPUS_TMP_DIR = "./corpus_files"
KERNEL_TEMPLATE_NAME = "kernel{}.txt"
CODE_COLUMN = "code_block"
NOTEBOOK_ID_COLUMN = "kaggle_id"
VOCAB_SIZE = 50000
MIN_FREQ = 3
DROPOUT = 0.15
# Reserve [VAR0] ... [VAR49] as special tokens so the trainer never splits them.
SPECIAL_TOKENS = [f"[VAR{i}]" for i in range(50)]


def make_corpus(df):
    """Write each notebook's code blocks to a per-notebook text file and return the file paths."""
    print("Creating corpus")
    os.makedirs(CORPUS_TMP_DIR, exist_ok=True)
    files = set()
    for _, row in df.iterrows():
        kernel_path = os.path.join(CORPUS_TMP_DIR, KERNEL_TEMPLATE_NAME.format(row[NOTEBOOK_ID_COLUMN]))
        # Open in append mode so every code block from the same notebook lands in one file.
        with open(kernel_path, "a") as f:
            f.write(row[CODE_COLUMN])
            f.write("\n")
        files.add(kernel_path)
    print("Done")
    return list(files)


if __name__ == "__main__":
    df = load_data(DATASET_PATH, sep=';')
    corpus_files = make_corpus(df)

    tokenizer = Tokenizer(BPE(dropout=DROPOUT))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(vocab_size=VOCAB_SIZE, min_frequency=MIN_FREQ, special_tokens=SPECIAL_TOKENS)

    print("Start BPE training")
    tokenizer.train(corpus_files, trainer)
    print("Done")

    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    tokenizer.save(MODEL_PATH)
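
Once the script has run, the saved tokenizer can be reloaded with the tokenizers library's Tokenizer.from_file and applied directly. A minimal sketch, assuming training completed and ../models/bpe_tokenizer.json exists; the input string here is just an illustrative code snippet:

from tokenizers import Tokenizer

# Reload the serialized tokenizer produced by train_bpe_tokenizer.py.
tokenizer = Tokenizer.from_file("../models/bpe_tokenizer.json")

# The Whitespace pre-tokenizer splits on whitespace and punctuation
# before the learned BPE merges are applied.
encoding = tokenizer.encode("df = pd.read_csv(path, sep=';')")
print(encoding.tokens)  # subword strings
print(encoding.ids)     # vocabulary ids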