Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

dataset.py 14 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
  1. """
  2. ExpressionDataset and helper classes
  3. Classes:
  4. ExpressionDataset
  5. GenesIndexes
  6. SamplesIndexes
  7. """
  8. from pathlib import Path
  9. from typing import Callable
  10. from typing import List
  11. import pandas as pd
  12. class ExpressionDataset:
  13. '''
  14. ExpressionDataset class to handle: samples, species, genes and expressions
  15. '''
  16. @staticmethod
  17. def load(name: str,
  18. expressions_path: Path,
  19. samples_path: Path,
  20. species_path: Path = None,
  21. genes_path: Path = None,
  22. genes_meta_path: Path = None,
  23. sep="\t",
  24. validate: bool = True):
  25. '''
  26. Loads expression dataset from separate files
  27. :param name:
  28. :param expressions_path: path to .tsv files with expressions
  29. :param samples_path: path to .tsv file with samples
  30. :param species_path: path to .tsv file with species metadata
  31. :param genes_path: path to .tsv file with orthology table
  32. :param genes_meta_path: path to .tsv file with reference genes metadata
  33. :param sep: separator - tab by default
  34. :param validate: if we want to control that expressions have all the samples and genes
  35. :return: ExpressionDataset
  36. '''
  37. expressions = pd.read_csv(expressions_path, sep=sep, index_col="run")
  38. samples = pd.read_csv(samples_path, sep=sep, index_col="run")
  39. species = None if species_path is None else pd.read_csv(species_path, sep=sep, index_col="species")
  40. genes = pd.read_csv(genes_path, sep=sep, index_col="Homo_sapiens")
  41. genes_meta = None if genes_meta_path is None else pd.read_csv(genes_meta_path, sep=sep, index_col="ensembl_id") #species gene symbol
  42. return ExpressionDataset(name, expressions, samples, species, genes, genes_meta, validate=validate)
  43. @staticmethod
  44. def from_folder(folder: Path,
  45. expressions_name: str = "expressions.tsv",
  46. samples_name: str = "samples.tsv",
  47. species_name: str = "species.tsv",
  48. genes_name: str = "genes.tsv",
  49. genes_meta_name: str = "genes_meta.tsv",
  50. sep="\t",
  51. validate: bool = True
  52. ):
  53. '''
  54. Function to load ExpressionDataset from specific folder and configure (if needed) the names of the corresponding files
  55. :param folder:
  56. :param expressions_name:
  57. :param samples_name:
  58. :param species_name:
  59. :param genes_name:
  60. :param genes_meta_name:
  61. :param sep:
  62. :param validate:
  63. :return:
  64. '''
  65. name = folder.name
  66. genes_meta_path = folder / genes_meta_name if (folder / genes_meta_name).exists() else None
  67. species_path = folder / species_name if (folder / species_name).exists() else None
  68. return ExpressionDataset.load(name,
  69. folder / expressions_name,
  70. folder / samples_name,
  71. species_path,
  72. folder / genes_name,
  73. genes_meta_path,
  74. sep,
  75. validate)
  76. def __init__(self,
  77. name: str,
  78. expressions: pd.DataFrame,
  79. samples: pd.DataFrame,
  80. species: pd.DataFrame = None, #additional species info
  81. genes: pd.DataFrame = None,
  82. genes_meta: pd.DataFrame = None, #for gene symbols and other useful info
  83. validate: bool = True #validates shapes of expressions, genes and samples
  84. ):
  85. self.name = name
  86. self.expressions = expressions
  87. self.genes = genes
  88. self.samples = samples
  89. self.species = species
  90. self.genes_meta = genes_meta
  91. if validate:
  92. self.check_rep_inv()
  93. def has_gene_info(self):
  94. return self.genes_meta is not None
  95. def has_species_info(self):
  96. return self.species is not None
  97. @property
  98. def by_genes(self):
  99. return GenesIndexes(self)
  100. @property
  101. def by_samples(self):
  102. '''
  103. Indexes by samples
  104. :return:
  105. '''
  106. return SamplesIndexes(self)
  107. def __len__(self):
  108. return self.expressions.shape[0]
  109. def get_label(self, label: str) -> pd.DataFrame:
  110. if label in self.samples.columns.to_list():
  111. return self.samples[[label]]
  112. elif (self.species is not None) and label in self.species.columns.to_list():
  113. return self.species[[label]]
  114. elif label in self.expressions.columns.to_list():
  115. return self.expressions[[label]]
  116. elif (self.genes_meta is not None) and label in self.genes_meta.columns.to_list():
  117. return self.genes_meta[[label]]
  118. else:
  119. assert label in self.genes.columns.to_list(), f"cannot find label {label} anywhere!"
  120. def extended_samples(self, samples_columns: List[str] = None, species_columns: List[str] = None):
  121. '''
  122. Merges samples with species dataframes
  123. :return:
  124. '''
  125. assert self.has_species_info(), "to get extended samples information there should be species info"
  126. if samples_columns is None:
  127. smp = self.samples.columns.to_list()
  128. elif "species" in samples_columns:
  129. smp = samples_columns
  130. else:
  131. smp = samples_columns + ["species"]
  132. samples = self.samples if samples_columns is None else self.samples[smp]
  133. species = self.species if species_columns is None else self.species[species_columns]
  134. merged = samples.merge(species, how="inner", left_on="species", right_index=True)
  135. return merged #if (samples_columns is not None and "species" in samples_columns) else merged.drop(columns = ["species"])
  136. def check_rep_inv(self):
  137. """
  138. Checks the class representation invariant.
  139. - rownames in data == rownames in samples_meta
  140. - colnames in data == rownames in features_meta
  141. Raises: AssertionError when violated.
  142. """
  143. assert (self.expressions.index == self.samples.index).all(), f"Expressions dataframe {self.expressions.shape} and samples {self.samples.shape} are incompatible."
  144. assert (self.expressions.columns == self.genes.index).all(), f"Expressions dataframe {self.expressions.shape} and genes {self.genes.shape} are incompatible."
  145. def copy(self): #TODO copy-meta (if exists)
  146. return ExpressionDataset(self.name,
  147. self.expressions.copy(),
  148. self.samples.copy(),
  149. self.species.copy() if self.species is not None else None,
  150. self.genes.copy() if self.genes is not None else None,
  151. self.genes_meta.copy() if self.genes_meta is not None else None
  152. )
  153. def __getitem__(self, items: tuple or List[str] or str):
  154. """
  155. Main indexer of ExpressionDataset
  156. :param items:
  157. :return: dataset[genes, samples] or dataset.by_genes[genes] if samples not specified
  158. """
  159. if type(items) == tuple and type(items[1]) != slice:
  160. ensembl_ids = [items[0]] if type(items[0]) == str else items[0]
  161. runs = [items[1]] if type(items[1]) == str else items[1]
  162. upd_genes = self.genes.loc[ensembl_ids]
  163. upd_samples = self.samples.loc[runs]
  164. upd_expressions = self.expressions.loc[runs][ensembl_ids]
  165. return ExpressionDataset(self.name, upd_expressions, upd_genes, upd_samples)
  166. elif type(items) == tuple and type(items[0]) == slice:
  167. return self.by_samples[items[1]]
  168. else:
  169. return self.by_genes[items]
  170. @property
  171. def shape(self):
  172. return [self.expressions, self.genes, self.samples]
  173. def _repr_html_(self):
  174. '''
  175. Function to provide nice HTML outlook in jupyter lab notebooks
  176. :return:
  177. '''
  178. gs = str(None) if self.genes_meta is None else str(self.genes_meta.shape)
  179. ss = str(None) if self.species is None else str(self.species.shape)
  180. return f"<table border='2'>" \
  181. f"<caption>{self.name}<caption>" \
  182. f"<tr><th>expressions</th><th>genes</th><th>species</th><th>samples</th><th>Genes Metadata</th><th>Species Metadata</th></tr>" \
  183. f"<tr><td>{str(self.expressions.shape)}</td><td>{str(self.genes.shape[0])}</td><td>{str(self.genes.shape[1])}</td><td>{str(self.samples.shape[0])}</td><td>{gs}</td><td>{ss}</td></tr>" \
  184. f"</table>"
  185. def write(self, folder: Path or str,
  186. expressions_name: str = "expressions.tsv",
  187. samples_name: str = "samples.tsv",
  188. species_name: str = "species.tsv",
  189. genes_name: str = "genes.tsv",
  190. genes_meta_name: str = "genes_meta.tsv",
  191. name_as_folder: bool = True,
  192. sep: str = "\t"):
  193. '''
  194. Writes ExpressionDataset to specific folder
  195. :param folder:
  196. :param expressions_name:
  197. :param samples_name:
  198. :param species_name:
  199. :param genes_name:
  200. :param genes_meta_name:
  201. :param name_as_folder:
  202. :param sep:
  203. :return:
  204. '''
  205. d: Path = folder if type(folder) == Path else Path(folder)
  206. folder: Path = d / self.name if name_as_folder else d
  207. folder.mkdir(parents=True, exist_ok=True) #create if not exist
  208. self.expressions.to_csv(folder / expressions_name, sep=sep, index = True)
  209. self.genes.to_csv(folder / genes_name, sep = sep, index = True)
  210. self.samples.to_csv(folder / samples_name, sep=sep, index = True)
  211. if self.genes_meta is not None:
  212. self.genes_meta.to_csv(folder / genes_meta_name, sep=sep, index=True)
  213. if self.species is not None:
  214. self.species.to_csv(folder / species_name, sep=sep, index=True)
  215. print(f"written {self.name} dataset content to {str(folder)}")
  216. return folder
  217. def collect(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> 'ExpressionDataset':
  218. '''
  219. Collects expressions and rewrites other dataframes
  220. :param filter_fun:
  221. :return:
  222. '''
  223. upd_expressions: pd.DataFrame = filter_fun(self.expressions.copy())
  224. upd_genes = self.genes.loc[upd_expressions.columns].copy()#.reindex(upd_expressions.columns)
  225. upd_samples = self.samples.loc[upd_expressions.index].copy()#.reindex(upd_expressions.index)
  226. upd_genes_meta = None if self.genes_meta is None else self.genes_meta.loc[upd_genes.index]
  227. return ExpressionDataset(self.name, upd_expressions, upd_samples, self.species, upd_genes, upd_genes_meta)
  228. class SamplesIndexes:
  229. """
  230. Representes by_samples indexer, i.d. dataset.by_samples[[gene_ids]]
  231. """
  232. def __init__(self, dataset):
  233. self.dataset = dataset
  234. def collect(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  235. '''
  236. Function to transform the samples (and filter related data in expressions dataframe) according to the lambda provided
  237. :param filter_fun:
  238. :return:
  239. '''
  240. upd_samples: pd.DataFrame = filter_fun(self.dataset.samples.copy())
  241. runs = upd_samples.index.tolist()
  242. upd_expressions = self.dataset.expressions.loc[runs]
  243. return ExpressionDataset(self.dataset.name, upd_expressions, self.dataset.genes, upd_samples)
  244. def filter(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  245. '''
  246. Function to filter DataSet samples (and filter related data in expressionda dataframe) according to the lambda provided
  247. :param filter_fun:
  248. :return:
  249. '''
  250. return self.collect(lambda df: self.dataset.samples[filter_fun(df)])
  251. def __getitem__(self, item) -> ExpressionDataset:
  252. '''
  253. Samples index function
  254. :param item:
  255. :return:
  256. '''
  257. items = [item] if type(item) == str else item
  258. upd_samples = self.dataset.samples.loc[items]
  259. upd_expressions = self.dataset.expressions.loc[items]
  260. return ExpressionDataset(self.dataset.name, upd_expressions, upd_samples, self.dataset.species, self.dataset.genes, self.dataset.genes_meta)
  261. def _repr_html_(self):
  262. '''
  263. Nice JupyterLab table HTML representation
  264. :return:
  265. '''
  266. return f"<table border='2'>" \
  267. f"<caption>{self.dataset.name} samples view<caption>" \
  268. f"<tr><th>Samples</th>" \
  269. f"<tr><td>{str(self.dataset.samples.shape[0])}</td></tr>" \
  270. f"</table>"
  271. class GenesIndexes:
  272. """
  273. Representes by_genes indexer, i.d. dataset.by_genes[[gene_ids]]
  274. """
  275. def __init__(self, dataset: ExpressionDataset):
  276. self.dataset = dataset
  277. def __getitem__(self, item) -> ExpressionDataset:
  278. items = [item] if type(item) == str else item
  279. upd_genes = self.dataset.genes.loc[items]
  280. upd_expressions = self.dataset.expressions[items]
  281. return ExpressionDataset(self.dataset.name, upd_expressions, upd_genes, self.dataset.samples)
  282. def _repr_html_(self):
  283. return f"<table border='2'>" \
  284. f"<caption>{self.dataset.name} Genes view<caption>" \
  285. f"<tr><th>Genes</th><th>Species</th><th>Species</th></tr>" \
  286. f"<tr><td>{str(self.dataset.genes.shape[0])}</td><td>{str(self.dataset.genes.shape[1])}</td></tr>" \
  287. f"</table>"
  288. def collect(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  289. upd_genes: pd.DataFrame = filter_fun(self.dataset.genes.copy())
  290. upd_expressions = self.dataset.expressions[upd_genes.index].copy()
  291. upd_expressions = upd_expressions.loc[self.dataset.samples.index]
  292. upd_genes_meta = None if self.dataset.genes_meta is None else self.dataset.genes_meta.loc[upd_genes.index]
  293. return ExpressionDataset(self.dataset.name, upd_expressions, self.dataset.samples, self.dataset.species, upd_genes, upd_genes_meta)
  294. def filter(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  295. return self.collect(lambda df: self.dataset.genes[filter_fun(df)])
Tip!

Press p to see the previous file or n to see the next file

Comments

Loading...