Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

get_posters.py 1.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  1. import os.path
  2. import numpy as np
  3. import pandas as pd
  4. import matplotlib.pyplot as plt
  5. import seaborn as sns
  6. import requests
  7. from bs4 import BeautifulSoup
  8. from PIL import Image
  9. from io import BytesIO
  10. import re
  11. import json
  12. def save_poster(imdb_id, img_url):
  13. '''
  14. Function that fetches and save the poster image from provided url
  15. and saves it with the provided id (corresponding with IMDb).
  16. INPUT: id from imdb, url where to find image
  17. OUTPUT: boolean flag if saved or not.
  18. '''
  19. # Get image data, and save it as imdb_id
  20. response = requests.get(img_url)
  21. img = Image.open(BytesIO(response.content))
  22. print(f'Saving {imdb_id}')
  23. img.save(f'data/posters/{imdb_id}.jpg')
  24. return True
  25. # get title of movie
  26. def title(index):
  27. return df[df.index == index]["movie_title"].values[0]
  28. # get index of movie
  29. def index(movie_title):
  30. return df[df.movie_title == movie_title]["index"].values[0]
  31. df = pd.read_csv('./data/df_final.csv')
  32. df = df.set_index('id')
  33. imdb_base_url = 'https://www.imdb.com/title/'
  34. #posters = df.loc['tt0756683':].index
  35. posters = df.index
  36. for poster in posters:
  37. print(f'processing {poster}')
  38. imdb_full_url = imdb_base_url + poster
  39. # Check to see if I already have it
  40. if os.path.isfile(f'data/posters/{poster}.jpg'):
  41. print(f'{poster}: we already have that one !')
  42. else:
  43. r = requests.get(imdb_full_url).content
  44. soup = BeautifulSoup(r, 'html.parser')
  45. json_dict = json.loads(str(soup.findAll('script',
  46. {'type':'application/ld+json'})[0].text))
  47. poster_url = json_dict['image']
  48. save_poster(poster, poster_url)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...