Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

download_images.py 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
  1. import os
  2. import json
  3. import shutil
  4. from tqdm import tqdm
  5. import tarfile
  6. import argparse
  7. from urllib.error import HTTPError
  8. import urllib.request
  9. def main(args):
  10. input_data = []
  11. with open(args.input_path) as f:
  12. for line in f:
  13. input_data.append(json.loads(line))
  14. # Download all PMC articles
  15. print('Downloading PMC articles')
  16. for idx, sample in enumerate(tqdm(input_data)):
  17. try:
  18. urllib.request.urlretrieve(sample['pmc_tar_url'], os.path.join(args.pmc_output_path, os.path.basename(sample['pmc_tar_url'])))
  19. except HTTPError as e:
  20. print('Error downloading PMC article: {}'.format(sample['pmc_tar_url']))
  21. continue
  22. # Untar all PMC articles
  23. print('Untarring PMC articles')
  24. for sample in tqdm(input_data):
  25. fname = os.path.join(args.pmc_output_path, os.path.basename(os.path.join(sample['pmc_tar_url'])))
  26. tar = tarfile.open(fname, "r:gz")
  27. tar.extractall(args.pmc_output_path)
  28. tar.close()
  29. # Copy to images directory
  30. print('Copying images')
  31. for sample in tqdm(input_data):
  32. src = os.path.join(args.pmc_output_path, sample['image_file_path'])
  33. dst = os.path.join(args.images_output_path, sample['pair_id']+'.jpg')
  34. shutil.copyfile(src, dst)
  35. if __name__ == '__main__':
  36. parser = argparse.ArgumentParser()
  37. parser.add_argument('--input_path', type=str, default='data/llava_med_image_urls.jsonl')
  38. parser.add_argument('--pmc_output_path', type=str, default='data/pmc_articles/')
  39. parser.add_argument('--images_output_path', type=str, default='data/images/')
  40. args = parser.parse_args()
  41. main(args)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...