Dean
/
MegaMolBART
mirror of https://github.com/NVIDIA/MegaMolBART


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
            # coding=utf-8

import os
import re
import math
import mmap
from typing import Optional
from dataclasses import dataclass

import torch
from nemo.core import Dataset, IterableDataset
from nemo.core.classes.dataset import DatasetConfig
from nemo.utils import logging
from nemo.collections.nlp.data.language_modeling.megatron.indexed_dataset import make_dataset

import time
__all__ = ['MoleculeBinaryDatasetConfig', 'MoleculeBinaryDataset']


@dataclass
class MoleculeBinaryDatasetConfig(DatasetConfig):
    """Configuration fields for the binary molecule dataset, extending ``DatasetConfig``.

    NOTE(review): none of these fields are read in the portion of the file
    reviewed here; the per-field comments below are inferred from the names
    and should be confirmed against the code that consumes this config.
    """
    # Path to the dataset. NOTE(review): the default has a '.csv' suffix while
    # the dataset classes in this file open the path with make_dataset(..., "mmap")
    # -- confirm the default is intentional.
    filepath: str = 'data.csv'
    micro_batch_size: int = 1
    # presumably selects an iterable-style dataset over a map-style one -- TODO confirm
    use_iterable: bool = False
    # same name as the dataset constructor's `map_data` argument
    map_data: bool = False
    # presumably augmentation / masking switches for encoder and decoder inputs -- TODO confirm
    encoder_augment: bool = True
    encoder_mask: bool = False
    decoder_augment: bool = False
    canonicalize_input: bool = False
    # same names as the dataset constructor's `metadata_path` / `num_samples` arguments
    metadata_path: Optional[str] = None
    num_samples: Optional[int] = None
    # presumably forwarded to the data loader -- verify against caller
    drop_last: bool = False
    shuffle: bool = False
    num_workers: Optional[int] = None
    pin_memory: bool = True # TODO: remove this if value is fixed


class MoleculeBinaryABCDataset:  # TODO should inherit from MegatronDataset
    """Molecule base dataset that reads tokenized data from binarized input files.

    The constructor only records its arguments. The indexed dataset is opened
    by ``_initialize_file``; ``len()`` raises ``AttributeError`` until that
    method has run, because ``self.len`` is only assigned there.
    """

    def __init__(self, filepath: str, metadata_path: Optional[str] = None,
                 num_samples: Optional[int] = None, map_data: bool = False):
        """
        Args:
            filepath (str): path to dataset file with unmasked tokenized smiles
            metadata_path (str, optional): stored on the instance; not used
                further by this class.
            num_samples (int, optional): stored on the instance; not used
                further by this class.
            map_data (bool): stored on the instance; when true, ``__exit__``
                closes ``self.fh`` if such a handle has been attached.
        """
        self.filepath = filepath
        # Keep the remaining arguments: __exit__ reads `map_data`, and dropping
        # them silently made that method fail with AttributeError.
        self.metadata_path = metadata_path
        self.num_samples = num_samples
        self.map_data = map_data
        self._cache = None

    def __len__(self):
        """Return the number of entries recorded by ``_initialize_file``."""
        return self.len

    def _initialize_file(self):
        """Open the indexed dataset at ``self.filepath`` and record its size.

        Sets ``self.indexed_dataset`` and ``self.len`` and logs basic stats.

        Raises:
            AssertionError: if the number of entries in ``sizes`` differs from
                the last value of ``doc_idx``.
        """
        start_time = time.time()
        self.indexed_dataset = make_dataset(self.filepath, "mmap", skip_warmup=False)
        num_entries = self.indexed_dataset.sizes.shape[0]
        self.len = num_entries
        assert num_entries == self.indexed_dataset.doc_idx[-1], \
            'indexed dataset sizes and doc_idx disagree on the number of entries'
        logging.info(' > finished creating indexed dataset in {:4f} seconds'.format(time.time() - start_time))

        logging.info(' > indexed dataset stats:')
        logging.info('    number of documents: {}'.format(self.indexed_dataset.doc_idx.shape[0] - 1))
        logging.info('    number of sentences: {}'.format(num_entries))

    def __enter__(self):
        """Return ``self`` so the dataset can be used in a ``with`` statement."""
        return self

    def __exit__(self, *exc_info):
        """Close the mapped file handle, if any.

        Accepts the three context-manager arguments as well as the historical
        zero-argument call. Returns ``None`` so exceptions are never suppressed.
        """
        # Both attributes are looked up defensively: `fh` is not assigned
        # anywhere in this class, and a subclass may bypass __init__.
        handle = getattr(self, 'fh', None)
        if getattr(self, 'map_data', False) and handle is not None:
            handle.close()


class MoleculeBinaryDataset(Dataset, MoleculeBinaryABCDataset):
    """Map-style dataset that serves items from a binarized (indexed) molecule file.

    The indexed dataset is opened during construction, so ``len()`` and
    indexing work as soon as the object exists.
    """

    def __init__(self, filepath: str, metadata_path: str = None, num_samples: int = None, map_data: bool = False, **kwargs):
        """
        Args:
            filepath (str): path to the binarized dataset, forwarded to the base class.
            metadata_path (str, optional): forwarded to the base class.
            num_samples (int, optional): forwarded to the base class.
            map_data (bool): forwarded to the base class.
            **kwargs: accepted and ignored, so callers may pass extra config entries.
        """
        super().__init__(filepath=filepath, metadata_path=metadata_path, num_samples=num_samples, map_data=map_data)
        self._initialize_file()

    def __getitem__(self, idx):
        """Return the entry at ``idx`` from the indexed dataset.

        Args:
            idx: an integer index, or a tensor holding one, which is converted
                to a Python scalar with ``.item()`` first.
        """
        # The previous per-item `time.time()` call stored a value nobody read.
        if torch.is_tensor(idx):
            idx = idx.item()
        return self.indexed_dataset.get(idx)