1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
schema: '2.0'
stages:
collect-isbns:
cmd: python ../run.py --rust collect-isbns -o all-isbns.parquet all-isbns.toml
deps:
- path: ../az2014/ratings.parquet
md5: f3e8018263dadb98a221f7a0fd0f6821
size: 147804724
- path: ../bx/cleaned-ratings.csv
md5: da3196d4af84ae39b3713e73c59677c7
size: 22627884
- path: ../goodreads/gr-book-ids.parquet
md5: bd138e1db016e3533554f08971ed28eb
size: 36155423
- path: ../loc-mds/book-isbns.parquet
md5: b692a9b00ba567028f566e985e503722
size: 54291815
- path: ../openlibrary/edition-isbns.parquet
md5: 4aae4d3eaf2ff663d71b9965ccf59133
size: 132821740
- path: ../src/cli/collect_isbns.rs
md5: 6cd9a1b90cb1d613ff7ba73c6f8be741
size: 2911
- path: all-isbns.toml
md5: 8d1990ca34a435e464021afb3b5fc9ed
size: 399
outs:
- path: all-isbns.parquet
md5: 29cc7ae0b86b0e0f5aa7389fdea7de5b
size: 193709539
cluster:
cmd: python run.py --rust cluster-books
deps:
- path: book-links/all-isbns.parquet
md5: 29cc7ae0b86b0e0f5aa7389fdea7de5b
size: 193709539
- path: goodreads/book-isbn-ids.parquet
md5: 299f5bfc7c2244f9a83598dbd56d6719
size: 19658190
- path: goodreads/gr-book-ids.parquet
md5: bd138e1db016e3533554f08971ed28eb
size: 36155423
- path: loc-mds/book-ids.parquet
md5: 7f47ba0c4aa3b202f0f1f93774b40f17
size: 64587938
- path: loc-mds/book-isbn-ids.parquet
md5: 8a1b9d423ecf63253f29cc0ee2d7e8df
size: 46596848
- path: openlibrary/all-works.parquet
md5: 43534d6324f0021769fdd8ed3287c48b
size: 147842482
- path: openlibrary/edition-isbn-ids.parquet
md5: f1bb7e35d883c19e733608ae58d8981b
size: 127237646
- path: openlibrary/edition-works.parquet
md5: 295d170711290ac99b92e60e179bcf7f
size: 146555043
- path: openlibrary/editions.parquet
md5: 48972480e58c663c3feff6e3d698c2ba
size: 723946186
- path: src/cli/cluster_books.rs
md5: a7be7a18af7ac92557becf4c84fae075
size: 4615
- path: src/graph/
md5: eaaf19b503f6fff1db4a6360e50f3d64.dir
size: 10386
nfiles: 4
outs:
- path: book-links/book-graph.mp.zst
md5: 5c8a331e19adc1adf997fcf516fc8ad9
size: 1055861783
- path: book-links/cluster-graph-edges.parquet
md5: ec419d3c7da6aee47ebeeb3d1e14ac15
size: 360353914
- path: book-links/cluster-graph-nodes.parquet
md5: faa95b816d304431065f2c83caf85d02
size: 625547671
- path: book-links/cluster-metrics.json
md5: 285ad00f741e212b9e89405ced7deffa
size: 37
- path: book-links/cluster-stats.parquet
md5: 00343d6b07198c9d26de2d46ce785206
size: 122446262
- path: book-links/isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
cluster-ol-first-authors:
cmd: python run.py --rust cluster extract-authors -o book-links/cluster-ol-first-authors.parquet
--first-author -s openlib
deps:
- path: book-links/isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
- path: openlibrary/author-names.parquet
md5: bbe41b19f31882eb0128142b351cc077
size: 126448846
- path: openlibrary/edition-authors.parquet
md5: 52d351f649c6cce481da4b4df6393940
size: 136280002
- path: openlibrary/edition-isbn-ids.parquet
md5: f1bb7e35d883c19e733608ae58d8981b
size: 127237646
- path: src/cli/cluster
md5: a1d600e4111815f34cd366be955d3f8f.dir
size: 25891
nfiles: 11
outs:
- path: book-links/cluster-ol-first-authors.parquet
md5: 04895650f15e77b085b15b718dbc65c9
size: 144423691
cluster-loc-first-authors:
cmd: python run.py --rust cluster extract-authors -o book-links/cluster-loc-first-authors.parquet
--first-author -s loc
deps:
- path: book-links/isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
- path: loc-mds/book-authors.parquet
md5: 8bb05bd8c1cd99c6d7d10b1ab10d6489
size: 103616710
- path: loc-mds/book-isbn-ids.parquet
md5: 8a1b9d423ecf63253f29cc0ee2d7e8df
size: 46596848
- path: src/cli/cluster
md5: a1d600e4111815f34cd366be955d3f8f.dir
size: 25891
nfiles: 11
outs:
- path: book-links/cluster-loc-first-authors.parquet
md5: 64f368ee99c996d5b685a4e6538a0ebc
size: 55799494
cluster-irst-authors:
cmd: python run.py --rust cluster-authors -o book-links/cluster-first-authors.parquet
--first-author -s openlib -s loc
deps:
- path: book-links/isbn-clusters.parquet
md5: 5ee9bb4b67de722e24363e140ea2791f
size: 161946382
- path: loc-mds/book-authors.parquet
md5: 2d3212dbf2405c48fb8bf2587a8587c3
size: 127567010
- path: loc-mds/book-isbn-ids.parquet
md5: 6a764a693a9baad23d7c489cd3f7bfc9
size: 70553002
- path: openlibrary/author-names.parquet
md5: 4e002793c585a769fc1334827697837e
size: 142462847
- path: openlibrary/edition-authors.parquet
md5: 7172b92182942d728830cdac0b4862b6
size: 164027959
- path: openlibrary/edition-isbn-ids.parquet
md5: afa5abff7a53ed402ba9ce4fb2a09635
size: 214125175
- path: src/bin/cluster-authors.rs
md5: 76f614625858e7ad84e273e875861490
size: 5253
outs:
- path: book-links/cluster-first-authors.parquet
md5: a7f439659cccd809efe4ac2ea276b058
size: 157637977
cluster-first-authors:
cmd: python run.py --rust cluster extract-authors -o book-links/cluster-first-authors.parquet
--first-author -s openlib -s loc
deps:
- path: book-links/isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
- path: loc-mds/book-authors.parquet
md5: 8bb05bd8c1cd99c6d7d10b1ab10d6489
size: 103616710
- path: loc-mds/book-isbn-ids.parquet
md5: 8a1b9d423ecf63253f29cc0ee2d7e8df
size: 46596848
- path: openlibrary/author-names.parquet
md5: bbe41b19f31882eb0128142b351cc077
size: 126448846
- path: openlibrary/edition-authors.parquet
md5: 52d351f649c6cce481da4b4df6393940
size: 136280002
- path: openlibrary/edition-isbn-ids.parquet
md5: f1bb7e35d883c19e733608ae58d8981b
size: 127237646
- path: src/cli/cluster
md5: a1d600e4111815f34cd366be955d3f8f.dir
size: 25891
nfiles: 11
outs:
- path: book-links/cluster-first-authors.parquet
md5: 36512d5a6527bf991d128972581f32e3
size: 177650717
cluster-genders:
cmd: python run.py --rust cluster extract-author-gender -o book-links/cluster-genders.parquet
-A book-links/cluster-first-authors.parquet
deps:
- path: book-links/cluster-first-authors.parquet
md5: 36512d5a6527bf991d128972581f32e3
size: 177650717
- path: book-links/cluster-stats.parquet
md5: 00343d6b07198c9d26de2d46ce785206
size: 122446262
- path: src/cli/cluster
md5: a1d600e4111815f34cd366be955d3f8f.dir
size: 25891
nfiles: 11
- path: viaf/author-genders.parquet
md5: cbd6871e6ba229881aed5f58126fd652
size: 102791330
- path: viaf/author-name-index.parquet
md5: 663023a30c3d2f93ca4dea7d26793e9d
size: 483832252
outs:
- path: book-links/cluster-genders.parquet
md5: 8cb24dcbda2c1559fbfd67583b81e60f
size: 106885602
gender-stats:
cmd: python ../run.py --rust fusion integration-stats.tcl
deps:
- path: ../az2014/az-cluster-ratings.parquet
md5: 674a41980f410014f5e04036cecec492
size: 302707350
- path: ../bx/bx-cluster-actions.parquet
md5: ef6ef99b0f6702258ee776afe194460c
size: 7251481
- path: ../bx/bx-cluster-ratings.parquet
md5: b37f08a3f38031c23c4db5cd73a5dc7d
size: 3043911
- path: ../goodreads/gr-cluster-actions.parquet
md5: 6cc9fcf431dc4a72d0efc16402bb522a
size: 2917412389
- path: ../goodreads/gr-cluster-ratings.parquet
md5: 223918febaed7724a1e77f60e964ed5f
size: 1400932246
- path: ../loc-mds/book-isbn-ids.parquet
md5: 8a1b9d423ecf63253f29cc0ee2d7e8df
size: 46596848
- path: cluster-genders.parquet
md5: 8cb24dcbda2c1559fbfd67583b81e60f
size: 106885602
- path: integration-stats.tcl
md5: 0b47c45c920d62c0c476e1eca3b312a5
size: 2019
- path: isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
outs:
- path: gender-stats.csv
md5: 44405ed9653db449ebf4588d163a047d
size: 1115
cluster-hashes:
cmd: python ../run.py --rust cluster hash -o cluster-hashes.parquet
deps:
- path: ../src/cli/cluster
md5: a1d600e4111815f34cd366be955d3f8f.dir
size: 25891
nfiles: 11
- path: isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
outs:
- path: cluster-hashes.parquet
md5: 80bc62083421cbab1558545bfa3aca08
size: 270217045
debug-graph:
cmd: python ./run.py --rust extract-graph --cluster 100004141 -o book-links/debug-graph.gml
deps:
- path: book-links/book-graph.mp.zst
md5: cbb067efc568960af703f83f5b73e9ac
size: 1283174149
- path: src/bin/extract-graph.rs
md5: c22a75eb91d779306a11d8bd2065b3e6
size: 1142
outs:
- path: book-links/debug-graph.gml
md5: b2fb3c4a07ace6f85a1b880b54ab5da8
size: 2144356
schema@cluster-stats:
cmd: python ../run.py --rust pq-info -o cluster-stats.json cluster-stats.parquet
deps:
- path: cluster-stats.parquet
md5: 00343d6b07198c9d26de2d46ce785206
size: 122446262
outs:
- path: cluster-stats.json
md5: fdf6157f2ed9d75095fa3c930f175467
size: 1232
schema@cluster-first-authors:
cmd: python ../run.py --rust pq-info -o cluster-first-authors.json cluster-first-authors.parquet
deps:
- path: cluster-first-authors.parquet
md5: 36512d5a6527bf991d128972581f32e3
size: 177650717
outs:
- path: cluster-first-authors.json
md5: 21429ff175c1e0f0ee1baf3ba2e94044
size: 358
schema@cluster-genders:
cmd: python ../run.py --rust pq-info -o cluster-genders.json cluster-genders.parquet
deps:
- path: cluster-genders.parquet
md5: 8cb24dcbda2c1559fbfd67583b81e60f
size: 106885602
outs:
- path: cluster-genders.json
md5: 6c8281c59066f473d0b429fb1c3697e0
size: 353
schema@all-isbns:
cmd: python ../run.py --rust pq-info -o all-isbns.json all-isbns.parquet
deps:
- path: all-isbns.parquet
md5: 29cc7ae0b86b0e0f5aa7389fdea7de5b
size: 193709539
outs:
- path: all-isbns.json
md5: 11ba5c5b21fbf50c9513659bd3d7cc63
size: 1039
schema@cluster-hashes:
cmd: python ../run.py --rust pq-info -o cluster-hashes.json cluster-hashes.parquet
deps:
- path: cluster-hashes.parquet
md5: 80bc62083421cbab1558545bfa3aca08
size: 270217045
outs:
- path: cluster-hashes.json
md5: e0b43e5875d8df9e9aff168d83a2cf6e
size: 500
schema@isbn-clusters:
cmd: python ../run.py --rust pq-info -o isbn-clusters.json isbn-clusters.parquet
deps:
- path: isbn-clusters.parquet
md5: 9bb3676bec6b5b3c03fe4c25701c21f4
size: 229330723
outs:
- path: isbn-clusters.json
md5: 766771cee94d211bc8f0b980cd2844bf
size: 492
Tip!
Press p or to see the previous file or,
n or to see the next file