Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

build_tokenizer.py 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
  1. from pathlib import Path
  2. import hydra
  3. import pandas as pd
  4. from molbart.utils.tokenizer import ChemformerTokenizer
  5. from molbart.utils.data_utils import REGEX
  6. def read_extra_tokens(paths):
  7. extra_tokens = []
  8. for path in paths:
  9. p = Path(path)
  10. if p.is_file():
  11. text = p.read_text()
  12. tokens = text.split("\n")
  13. tokens = [token for token in tokens if token != ""]
  14. print(f"Read {len(tokens)} tokens from {path}")
  15. extra_tokens.extend(tokens)
  16. return extra_tokens
  17. def build_unused_tokens(num_tokens):
  18. tokens = []
  19. for i in range(num_tokens):
  20. token = f"<UNUSED_{str(i)}>"
  21. tokens.append(token)
  22. return tokens
  23. @hydra.main(version_base=None, config_path="config", config_name="build_tokeniser")
  24. def main(args):
  25. print("Reading molecule dataset...")
  26. mol_dataset = pd.read_pickle(args.data_path)
  27. smiles = mol_dataset[args.smiles_column].values.tolist()
  28. print("Completed reading dataset.")
  29. print("Reading extra tokens...")
  30. paths = [args.mol_opt_tokens_path, args.prop_pred_tokens_path]
  31. extra_tokens = read_extra_tokens(paths)
  32. unused_tokens = build_unused_tokens(args.num_unused_tokens)
  33. print("Completed reading extra tokens.")
  34. print("Building tokenizer...")
  35. tokenizer = ChemformerTokenizer(
  36. smiles=smiles,
  37. tokens=extra_tokens + unused_tokens,
  38. regex_token_patterns=REGEX.split("|"),
  39. )
  40. print("Completed building tokenizer.")
  41. print("Writing tokenizer...")
  42. tokenizer.save_vocabulary(args.tokeniser_path)
  43. print("Complete.")
# Script entry point: main() is called without arguments because the
# @hydra.main decorator on it is what provides the `args` config object.
if __name__ == "__main__":
    main()
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...