Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

molecule_binary_dataset.py 2.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  1. # coding=utf-8
  2. import os
  3. import re
  4. import math
  5. import mmap
  6. from typing import Optional
  7. from dataclasses import dataclass
  8. import torch
  9. from nemo.core import Dataset, IterableDataset
  10. from nemo.core.classes.dataset import DatasetConfig
  11. from nemo.utils import logging
  12. from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_dataset
  13. import time
  14. __all__ = ['MoleculeBinaryDatasetConfig', 'MoleculeBinaryDataset']
  15. @dataclass
  16. class MoleculeBinaryDatasetConfig(DatasetConfig):
  17. filepath: str = 'data.csv'
  18. micro_batch_size: int = 1
  19. use_iterable: bool = False
  20. map_data: bool = False
  21. encoder_augment: bool = True
  22. encoder_mask: bool = False
  23. decoder_augment: bool = False
  24. canonicalize_input: bool = False
  25. metadata_path: Optional[str] = None
  26. num_samples: Optional[int] = None
  27. drop_last: bool = False
  28. shuffle: bool = False
  29. num_workers: Optional[int] = None
  30. pin_memory: bool = True # TODO: remove this if value is fixed
  31. class MoleculeBinaryABCDataset(): # TODO should inheret from MegatronDataset
  32. """Molecule base dataset that reads tokenized data from binarized input files."""
  33. def __init__(self, filepath: str, metadata_path: str = None, num_samples: int = None, map_data: bool = False):
  34. """
  35. Args:
  36. filepath (str): path to dataset file with unmasked tokenized smiles
  37. """
  38. self.filepath = filepath
  39. self._cache = None
  40. def __len__(self):
  41. return self.len
  42. def _initialize_file(self):
  43. start_time = time.time()
  44. self.indexed_dataset = make_dataset(self.filepath,"mmap", skip_warmup=False)
  45. self.len = self.indexed_dataset.sizes.shape[0]
  46. assert self.indexed_dataset.sizes.shape[0] == self.indexed_dataset.doc_idx[-1]
  47. logging.info(' > finished creating indexed dataset in {:4f} ' 'seconds'.format(time.time() - start_time))
  48. logging.info(' > indexed dataset stats:')
  49. logging.info(' number of documents: {}'.format(self.indexed_dataset.doc_idx.shape[0] - 1))
  50. logging.info(' number of sentences: {}'.format(self.indexed_dataset.sizes.shape[0]))
  51. def __exit__(self):
  52. if self.map_data:
  53. self.fh.close()
  54. class MoleculeBinaryDataset(Dataset, MoleculeBinaryABCDataset):
  55. """Dataset that reads GPU-specific portion of data into memory from Binary file"""
  56. def __init__(self, filepath: str, metadata_path: str = None, num_samples: int = None, map_data: bool = False, **kwargs):
  57. super().__init__(filepath=filepath, metadata_path=metadata_path, num_samples=num_samples, map_data=map_data)
  58. self._initialize_file()
  59. def __getitem__(self, idx):
  60. st = time.time()
  61. if torch.is_tensor(idx):
  62. idx = idx.item()
  63. return self.indexed_dataset.get(idx)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...