Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

dataset.py 18 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
  1. """
ExpressionDataset and helper classes
Classes:
    ExpressionDataset
    GenesIndexes
    SamplesIndexes
    SpeciesIndexes
  7. """
  8. from pathlib import Path
  9. from typing import Callable, Union
  10. from typing import List, Tuple
  11. from functools import cached_property
  12. import pandas as pd
  13. from dataclasses import dataclass
  14. class ExpressionDataset:
  15. '''
  16. ExpressionDataset class to handle: samples, species, genes and expressions
  17. '''
  18. @staticmethod
  19. def load(name: str,
  20. expressions_path: Path,
  21. samples_path: Path,
  22. species_path: Path = None,
  23. genes_path: Path = None,
  24. genes_meta_path: Path = None,
  25. sep="\t",
  26. validate: bool = True):
  27. '''
  28. Loads expression dataset from separate files
  29. :param name:
  30. :param expressions_path: path to .tsv files with expressions
  31. :param samples_path: path to .tsv file with samples
  32. :param species_path: path to .tsv file with species metadata
  33. :param genes_path: path to .tsv file with orthology table
  34. :param genes_meta_path: path to .tsv file with reference genes metadata
  35. :param sep: separator - tab by default
  36. :param validate: if we want to control that expressions have all the samples and genes
  37. :return: ExpressionDataset
  38. '''
  39. expressions = pd.read_csv(expressions_path, sep=sep, index_col="run")
  40. samples = pd.read_csv(samples_path, sep=sep, index_col="run")
  41. species = None if species_path is None else pd.read_csv(species_path, sep=sep, index_col=0)
  42. genes = pd.read_csv(genes_path, sep=sep, index_col=0)
  43. genes_meta = None if genes_meta_path is None else pd.read_csv(genes_meta_path, sep=sep, index_col=0) #species gene symbol
  44. return ExpressionDataset(name, expressions, samples, species, genes, genes_meta, validate=validate)
  45. @staticmethod
  46. def from_folder(folder: Path,
  47. expressions_name: str = "expressions.tsv",
  48. samples_name: str = "samples.tsv",
  49. species_name: str = "species.tsv",
  50. genes_name: str = "genes.tsv",
  51. genes_meta_name: str = "genes_meta.tsv",
  52. sep="\t",
  53. validate: bool = True
  54. ):
  55. '''
  56. Function to load ExpressionDataset from specific folder and configure (if needed) the names of the corresponding files
  57. :param folder:
  58. :param expressions_name:
  59. :param samples_name:
  60. :param species_name:
  61. :param genes_name:
  62. :param genes_meta_name:
  63. :param sep:
  64. :param validate:
  65. :return:
  66. '''
  67. name = folder.name
  68. genes_meta_path = folder / genes_meta_name if (folder / genes_meta_name).exists() else None
  69. species_path = folder / species_name if (folder / species_name).exists() else None
  70. return ExpressionDataset.load(name,
  71. folder / expressions_name,
  72. folder / samples_name,
  73. species_path,
  74. folder / genes_name,
  75. genes_meta_path,
  76. sep,
  77. validate)
  78. def __init__(self,
  79. name: str,
  80. expressions: pd.DataFrame,
  81. samples: pd.DataFrame,
  82. species: pd.DataFrame = None, #additional species info
  83. genes: pd.DataFrame = None,
  84. genes_meta: pd.DataFrame = None, #for gene symbols and other useful info
  85. validate: bool = True #validates shapes of expressions, genes and samples
  86. ):
  87. self.name = name
  88. self.expressions = expressions
  89. self.genes = genes
  90. self.samples = samples
  91. self.species = species
  92. self.genes_meta = genes_meta
  93. if validate:
  94. self.check_rep_inv()
  95. def has_gene_info(self):
  96. return self.genes_meta is not None
  97. def has_species_info(self):
  98. return self.species is not None
  99. @cached_property
  100. def by_genes(self) -> 'GenesIndexes':
  101. return GenesIndexes(self)
  102. @cached_property
  103. def by_species(self) -> 'SpeciesIndexes':
  104. return SpeciesIndexes(self)
  105. @cached_property
  106. def by_samples(self) -> 'SamplesIndexes':
  107. '''
  108. Indexes by samples
  109. :return:
  110. '''
  111. return SamplesIndexes(self)
  112. def __len__(self):
  113. return self.expressions.shape[0]
  114. def get_label(self, label: str) -> pd.DataFrame:
  115. if label in self.samples.columns.to_list():
  116. return self.samples[[label]]
  117. elif (self.species is not None) and label in self.species.columns.to_list():
  118. return self.species[[label]]
  119. elif label in self.expressions.columns.to_list():
  120. return self.expressions[[label]]
  121. elif (self.genes_meta is not None) and label in self.genes_meta.columns.to_list():
  122. return self.genes_meta[[label]]
  123. else:
  124. assert label in self.genes.columns.to_list(), f"cannot find label {label} anywhere!"
  125. def extended_samples(self, samples_columns: List[str] = None, species_columns: List[str] = None):
  126. '''
  127. Merges samples with species dataframes
  128. :return:
  129. '''
  130. assert self.has_species_info(), "to get extended samples information there should be species info"
  131. if samples_columns is None:
  132. smp = self.samples.columns.to_list()
  133. elif "species" in samples_columns:
  134. smp = samples_columns
  135. else:
  136. smp = samples_columns + ["species"]
  137. samples = self.samples if samples_columns is None else self.samples[smp]
  138. species = self.species if species_columns is None else self.species[species_columns]
  139. merged = samples.merge(species, how="inner", left_on="species", right_index=True)
  140. return merged #if (samples_columns is not None and "species" in samples_columns) else merged.drop(columns = ["species"])
  141. def check_rep_inv(self):
  142. """
  143. Checks the class representation invariant.
  144. - rownames in data == rownames in samples_meta
  145. - colnames in data == rownames in features_meta
  146. Raises: AssertionError when violated.
  147. """
  148. assert (self.expressions.index == self.samples.index).all(), f"Expressions dataframe {self.expressions.shape} and samples {self.samples.shape} are incompatible."
  149. assert (self.expressions.columns == self.genes.index).all(), f"Expressions dataframe {self.expressions.shape} and genes {self.genes.shape} are incompatible."
  150. def copy(self): #TODO copy-meta (if exists)
  151. return ExpressionDataset(self.name,
  152. self.expressions.copy(),
  153. self.samples.copy(),
  154. self.species.copy() if self.species is not None else None,
  155. self.genes.copy() if self.genes is not None else None,
  156. self.genes_meta.copy() if self.genes_meta is not None else None
  157. )
  158. def __getitem__(self, items: tuple or List[str] or str):
  159. """
  160. Main indexer of ExpressionDataset
  161. :param items:
  162. :return: dataset[genes, samples] or dataset.by_genes[genes] if samples not specified
  163. """
  164. if type(items) == tuple and type(items[1]) != slice:
  165. ensembl_ids = [items[0]] if type(items[0]) == str else items[0]
  166. runs = [items[1]] if type(items[1]) == str else items[1]
  167. upd_genes = self.genes.loc[ensembl_ids]
  168. upd_samples = self.samples.loc[runs]
  169. upd_expressions = self.expressions.loc[runs][ensembl_ids]
  170. return ExpressionDataset(self.name, upd_expressions, upd_genes, upd_samples)
  171. elif type(items) == tuple and type(items[0]) == slice:
  172. return self.by_samples[items[1]]
  173. else:
  174. return self.by_genes[items]
  175. def drop_not_expressed_in(self, species: Union[List[str], str], thresh: float = 0.001):
  176. sps = species if isinstance(species, List) else [species]
  177. ss = self.samples[self.samples["species"].isin(sps)]
  178. sub_selection: pd.DataFrame = self.expressions.loc[ss.index].dropna(axis=1, thresh=thresh).copy()
  179. return self.collect(lambda exp: exp.loc[:, sub_selection.columns])
  180. @property
  181. def shape(self):
  182. return [self.expressions.shape, self.genes.shape, self.samples.shape]
  183. def __repr__(self):
  184. #to fix jupyter freeze (see https://github.com/ipython/ipython/issues/9771 )
  185. return self._repr_html_()
  186. def _repr_html_(self):
  187. '''
  188. Function to provide nice HTML outlook in jupyter lab notebooks
  189. :return:
  190. '''
  191. gs = str(None) if self.genes_meta is None else str(self.genes_meta.shape)
  192. ss = str(None) if self.species is None else str(self.species.shape)
  193. return f"<table border='2'>" \
  194. f"<caption>{self.name}<caption>" \
  195. f"<tr><th>expressions</th><th>genes</th><th>species</th><th>samples</th><th>Genes Metadata</th><th>Species Metadata</th></tr>" \
  196. f"<tr><td>{str(self.expressions.shape)}</td><td>{str(self.genes.shape)}</td><td>{str(self.genes.shape[1]+1)}</td><td>{str(self.samples.shape[0])}</td><td>{gs}</td><td>{ss}</td></tr>" \
  197. f"</table>"
  198. def write(self, folder: Path or str,
  199. expressions_name: str = "expressions.tsv",
  200. samples_name: str = "samples.tsv",
  201. species_name: str = "species.tsv",
  202. genes_name: str = "genes.tsv",
  203. genes_meta_name: str = "genes_meta.tsv",
  204. name_as_folder: bool = True,
  205. sep: str = "\t"):
  206. '''
  207. Writes ExpressionDataset to specific folder
  208. :param folder:
  209. :param expressions_name:
  210. :param samples_name:
  211. :param species_name:
  212. :param genes_name:
  213. :param genes_meta_name:
  214. :param name_as_folder:
  215. :param sep:
  216. :return:
  217. '''
  218. d: Path = folder if type(folder) == Path else Path(folder)
  219. folder: Path = d / self.name if name_as_folder else d
  220. folder.mkdir(parents=True, exist_ok=True) #create if not exist
  221. self.expressions.to_csv(folder / expressions_name, sep=sep, index = True)
  222. self.genes.to_csv(folder / genes_name, sep = sep, index = True)
  223. self.samples.to_csv(folder / samples_name, sep=sep, index = True)
  224. if self.genes_meta is not None:
  225. self.genes_meta.to_csv(folder / genes_meta_name, sep=sep, index=True)
  226. if self.species is not None:
  227. self.species.to_csv(folder / species_name, sep=sep, index=True)
  228. print(f"written {self.name} dataset content to {str(folder)}")
  229. return folder
  230. def dropna(self, thresh: float):
  231. return self.collect(lambda exp: exp.dropna(thresh = thresh, axis = 1))
  232. def collect(self, collect_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> 'ExpressionDataset':
  233. '''
  234. Collects expressions and rewrites other dataframes
  235. :param collect_fun:
  236. :return:
  237. '''
  238. upd_expressions: pd.DataFrame = collect_fun(self.expressions.copy())
  239. upd_genes = self.genes.loc[upd_expressions.columns].copy()#.reindex(upd_expressions.columns)
  240. upd_samples = self.samples.loc[upd_expressions.index].copy()#.reindex(upd_expressions.index)
  241. upd_genes_meta = None if self.genes_meta is None else self.genes_meta.loc[upd_genes.index].copy()
  242. species_index = upd_samples["species"].drop_duplicates()
  243. upd_species = None if self.species is None else self.species.loc[species_index].copy()
  244. #if upd_genes_meta is not None:
  245. # upd_genes_meta.index.name = "ensembl_id"
  246. return ExpressionDataset(self.name, upd_expressions, upd_samples, upd_species, upd_genes, upd_genes_meta)
  247. def min_max_trait(self, trait: str) -> List:
  248. return [self.species[trait].idxmin(), self.species[trait].idxmax()]
  249. @dataclass(frozen=True)
  250. class SpeciesIndexes:
  251. """
  252. Represent indexing by species (returns all samples that fit species
  253. """
  254. dataset: ExpressionDataset
  255. def __getitem__(self, item) -> ExpressionDataset:
  256. '''
  257. Samples index function
  258. :param item:
  259. :return:
  260. '''
  261. #assert self.dataset.species is not None, "You can use species index only if you have species annotation!"
  262. items = [item] if type(item) == str else item
  263. return self.dataset.by_samples.filter(lambda samples: samples[self.dataset.samples["species"].isin(items)])
  264. def filter(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]):
  265. assert self.dataset.species is not None, "You can use species index only if you have species annotation!"
  266. return self.collect(lambda df: self.dataset.species[filter_fun(df)])
  267. def dropna(self, columns: Union[str, List[str]]) -> ExpressionDataset:
  268. sub: List[str] = [columns] if isinstance(columns, str) else columns
  269. return self.collect(lambda species: species.dropna(subset=sub))
  270. def collect(self, collect_fun: Callable[[pd.DataFrame], pd.DataFrame]):
  271. assert self.dataset.species is not None, "You can use species index only if you have species annotation!"
  272. upd_species = collect_fun(self.dataset.species.copy())
  273. upd_samples: pd.DataFrame = self.dataset.samples[self.dataset.samples.species.isin(upd_species.index)].copy()
  274. species_to_select = [s for s in upd_species.index.to_list() if s != self.dataset.genes.index.name]
  275. upd_expressions: pd.DataFrame = self.dataset.expressions.loc[upd_samples.index].copy()
  276. upd_genes = self.dataset.genes[species_to_select].copy()
  277. upd_genes_meta: pd.DataFrame = None if self.dataset.genes_meta is None else self.dataset.genes_meta.loc[upd_genes.index]
  278. #if upd_genes_meta is not None:
  279. # upd_genes_meta.index.name = "ensembl_id"
  280. #upd_genes_meta = None if self.dataset.genes_meta is None else upd_genes
  281. #upd_expressions = upd_expressions.reindex(upd_samples.index)
  282. return ExpressionDataset(self.dataset.name, upd_expressions, upd_samples, upd_species, upd_genes, upd_genes_meta)
  283. @dataclass(frozen=True)
  284. class SamplesIndexes:
  285. dataset: ExpressionDataset
  286. def collect(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  287. '''
  288. Function to transform the samples (and filter related data in expressions dataframe) according to the lambda provided
  289. :param filter_fun:
  290. :return:
  291. '''
  292. upd_samples: pd.DataFrame = filter_fun(self.dataset.samples.copy())
  293. upd_expressions: pd.DataFrame = self.dataset.expressions.loc[upd_samples.index].copy()
  294. species_index = upd_samples["species"].drop_duplicates()
  295. upd_species = None if self.dataset.species is None else self.dataset.species.copy().loc[species_index]
  296. upd_genes = self.dataset.genes[[s for s in species_index.to_list() if s != self.dataset.genes.index.name]].copy()
  297. upd_genes_meta: pd.DataFrame = None if self.dataset.genes_meta is None else self.dataset.genes_meta.loc[upd_genes.index]
  298. if upd_genes_meta is not None:
  299. upd_genes_meta.index.name = "ensembl_id"
  300. #upd_genes_meta = None if self.dataset.genes_meta is None else upd_genes
  301. #upd_expressions = upd_expressions.reindex(upd_samples.index)
  302. return ExpressionDataset(self.dataset.name, upd_expressions, upd_samples, upd_species, upd_genes, upd_genes_meta)
  303. def filter(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  304. '''
  305. Function to filter DataSet samples (and filter related data in expressionda dataframe) according to the lambda provided
  306. :param filter_fun:
  307. :return:
  308. '''
  309. return self.collect(lambda df: self.dataset.samples[filter_fun(df)])
  310. def __getitem__(self, item) -> ExpressionDataset:
  311. '''
  312. Samples index function
  313. :param item:
  314. :return:
  315. '''
  316. items = [item] if type(item) == str else item
  317. upd_samples = self.dataset.samples.loc[items]
  318. upd_expressions = self.dataset.expressions.loc[items]
  319. return ExpressionDataset(self.dataset.name, upd_expressions, upd_samples, self.dataset.species, self.dataset.genes, self.dataset.genes_meta)
  320. def _repr_html_(self):
  321. '''
  322. Nice JupyterLab table HTML representation
  323. :return:
  324. '''
  325. return f"<table border='2'>" \
  326. f"<caption>{self.dataset.name} samples view<caption>" \
  327. f"<tr><th>Samples</th>" \
  328. f"<tr><td>{str(self.dataset.samples.shape[0])}</td></tr>" \
  329. f"</table>"
  330. @dataclass(frozen=True)
  331. class GenesIndexes:
  332. dataset: ExpressionDataset
  333. def __getitem__(self, item) -> ExpressionDataset:
  334. items = [item] if type(item) == str else item
  335. return self.collect(lambda gs: gs.loc[items])
  336. def dropna(self, thresh: float):
  337. return self.collect(lambda genes: genes.dropna(thresh=thresh))
  338. def _repr_html_(self):
  339. return f"<table border='2'>" \
  340. f"<caption>{self.dataset.name} Genes view<caption>" \
  341. f"<tr><th>Genes</th><th>Species</th><th>Species</th></tr>" \
  342. f"<tr><td>{str(self.dataset.genes.shape[0])}</td><td>{str(self.dataset.genes.shape[1]+1)}</td></tr>" \
  343. f"</table>"
  344. def collect(self, collect_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  345. upd_genes: pd.DataFrame = collect_fun(self.dataset.genes).copy()
  346. upd_expressions = self.dataset.expressions[upd_genes.index].loc[self.dataset.samples.index].copy()
  347. upd_genes_meta = None if self.dataset.genes_meta is None else self.dataset.genes_meta.loc[upd_genes.index]
  348. #if upd_genes_meta is not None:
  349. # upd_genes_meta.index.name = "ensembl_id"
  350. return ExpressionDataset(self.dataset.name, upd_expressions, self.dataset.samples, self.dataset.species, upd_genes, upd_genes_meta)
  351. def filter(self, filter_fun: Callable[[pd.DataFrame], pd.DataFrame]) -> ExpressionDataset:
  352. return self.collect(lambda df: self.dataset.genes[filter_fun(df)])
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...