Are you sure you want to delete this access key?
💈 Développez un moteur de recommandation de films
Preamble Emacs Setup noexport(setq org-src-fontify-natively t) (setq org-latex-image-default-width "5cm") (setq org-image-actual-width nil)
Imports%matplotlib inline %load_ext autoreload %autoreload 2
import sys import os import warnings warnings.filterwarnings("ignore") import pickle
import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns
import tensorflow as tf
from keras.preprocessing.image import load_img from keras.preprocessing.image import img_to_array from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16 from keras.models import Model
from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.metrics.pairwise import cosine_similarity
from random import randint import pickle
:results:
:end:
# Functions
def display_all(df):
    """Display ``df`` with pandas' row and column display limits set to 100.

    The limits are applied through ``pd.option_context``, so they are only
    in effect while ``display`` runs inside this call.
    """
    wide_view = pd.option_context(
        "display.max_rows", 100,
        "display.max_columns", 100,
    )
    with wide_view:
        display(df)
:results:
1 - e21f656d
-1695-428d-8b46-481324f670
:end:
import IPython import tabulate
class OrgFormatter(IPython.core.formatters.BaseFormatter):
    """Display formatter for the 'text/org' format type."""
    # Format type string; the notebook stores this formatter under the same
    # 'text/org' key in the display formatter's dict.
    format_type = IPython.core.formatters.Unicode('text/org')
    # Name of the per-object hook method. Presumably IPython looks this
    # method up on displayed objects — confirm against the BaseFormatter docs.
    print_method = IPython.core.formatters.ObjectName('_repr_org_')
def pd_dataframe_to_org(df):
    """Render ``df`` as an Org-mode table string using ``tabulate``.

    The table is built with ``headers='keys'``, the ``orgtbl`` table format
    and the index always shown.
    """
    org_options = {
        'headers': 'keys',
        'tablefmt': 'orgtbl',
        'showindex': 'always',
    }
    return tabulate.tabulate(df, **org_options)
# Register an Org formatter with the running IPython shell and make it
# render pandas DataFrames through pd_dataframe_to_org.
ip = get_ipython()
f = OrgFormatter()
ip.display_formatter.formatters['text/org'] = f
f.for_type_by_name('pandas.core.frame', 'DataFrame', pd_dataframe_to_org)
:results:
:end:
Load Data Load Datadf = pd.read_csv('../data/processed/movie_metadata_processed.csv')
:results:
:end:
Glimpse at the datadisplay_all(df.describe(include='all').T)
:results:
count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|---|---|---|
Unnamed: 0 | 4688 | nan | nan | nan | 2343.5 | 1353.45 | 0 | 1171.75 | 2343.5 | 3515.25 | 4687 |
id | 4688 | 4688 | tt0006864 | 1 | nan | nan | nan | nan | nan | nan | nan |
color | 4673 | 2 | Color | 4477 | nan | nan | nan | nan | nan | nan | nan |
director_name | 4688 | 2370 | Steven Spielberg | 26 | nan | nan | nan | nan | nan | nan | nan |
actor_3_facebook_likes | 4670 | nan | nan | nan | 620.241 | 1591.97 | 0 | 130 | 362 | 632.75 | 23000 |
actor_2_name | 4678 | 2923 | Morgan Freeman | 16 | nan | nan | nan | nan | nan | nan | nan |
actor_1_facebook_likes | 4681 | nan | nan | nan | 6548.67 | 15335.4 | 0 | 606 | 984 | 11000 | 640000 |
actor_1_name | 4681 | 2020 | Robert De Niro | 47 | nan | nan | nan | nan | nan | nan | nan |
movie_title | 4688 | 4686 | The Host | 2 | nan | nan | nan | nan | nan | nan | nan |
actor_3_name | 4670 | 3389 | Steve Coogan | 8 | nan | nan | nan | nan | nan | nan | nan |
plot_keywords | 4553 | 4545 | based on novel | 4 | nan | nan | nan | nan | nan | nan | nan |
language | 4688 | 47 | English | 4376 | nan | nan | nan | nan | nan | nan | nan |
country | 4687 | 65 | USA | 3540 | nan | nan | nan | nan | nan | nan | nan |
content_rating | 4688 | 12 | R | 2021 | nan | nan | nan | nan | nan | nan | nan |
title_year | 4688 | nan | nan | nan | 2002.42 | 12.4284 | 1916 | 1999 | 2005 | 2011 | 2016 |
actor_2_facebook_likes | 4678 | nan | nan | nan | 1614.51 | 4017.5 | 0 | 275 | 592 | 912 | 137000 |
aspect_ratio | 4388 | nan | nan | nan | 2.12791 | 0.807937 | 1.18 | 1.85 | 2.35 | 2.35 | 16 |
Action | 4688 | nan | nan | nan | 0.225043 | 0.417655 | 0 | 0 | 0 | 0 | 1 |
Adventure | 4688 | nan | nan | nan | 0.180461 | 0.384612 | 0 | 0 | 0 | 0 | 1 |
Animation | 4688 | nan | nan | nan | 0.0492747 | 0.216464 | 0 | 0 | 0 | 0 | 1 |
Biography | 4688 | nan | nan | nan | 0.0612201 | 0.239759 | 0 | 0 | 0 | 0 | 1 |
Comedy | 4688 | nan | nan | nan | 0.380119 | 0.485468 | 0 | 0 | 0 | 1 | 1 |
Crime | 4688 | nan | nan | nan | 0.174275 | 0.379386 | 0 | 0 | 0 | 0 | 1 |
Documentary | 4688 | nan | nan | nan | 0.025384 | 0.157305 | 0 | 0 | 0 | 0 | 1 |
Drama | 4688 | nan | nan | nan | 0.512159 | 0.499905 | 0 | 0 | 1 | 1 | 1 |
Family | 4688 | nan | nan | nan | 0.108575 | 0.311139 | 0 | 0 | 0 | 0 | 1 |
Fantasy | 4688 | nan | nan | nan | 0.116254 | 0.320564 | 0 | 0 | 0 | 0 | 1 |
Film-Noir | 4688 | nan | nan | nan | 0.00127986 | 0.0357561 | 0 | 0 | 0 | 0 | 1 |
History | 4688 | nan | nan | nan | 0.0415956 | 0.199684 | 0 | 0 | 0 | 0 | 1 |
Horror | 4688 | nan | nan | nan | 0.108575 | 0.311139 | 0 | 0 | 0 | 0 | 1 |
Music | 4688 | nan | nan | nan | 0.0441553 | 0.205462 | 0 | 0 | 0 | 0 | 1 |
Musical | 4688 | nan | nan | nan | 0.0273038 | 0.162984 | 0 | 0 | 0 | 0 | 1 |
Mystery | 4688 | nan | nan | nan | 0.0949232 | 0.29314 | 0 | 0 | 0 | 0 | 1 |
News | 4688 | nan | nan | nan | 0.000639932 | 0.0252915 | 0 | 0 | 0 | 0 | 1 |
Romance | 4688 | nan | nan | nan | 0.222056 | 0.415673 | 0 | 0 | 0 | 0 | 1 |
Sci-Fi | 4688 | nan | nan | nan | 0.119027 | 0.323855 | 0 | 0 | 0 | 0 | 1 |
Short | 4688 | nan | nan | nan | 0.00106655 | 0.0326442 | 0 | 0 | 0 | 0 | 1 |
Sport | 4688 | nan | nan | nan | 0.0364761 | 0.187492 | 0 | 0 | 0 | 0 | 1 |
Thriller | 4688 | nan | nan | nan | 0.275384 | 0.446755 | 0 | 0 | 0 | 1 | 1 |
War | 4688 | nan | nan | nan | 0.043302 | 0.203558 | 0 | 0 | 0 | 0 | 1 |
Western | 4688 | nan | nan | nan | 0.019198 | 0.137235 | 0 | 0 | 0 | 0 | 1 |
num_critic_for_reviews | 4688 | nan | nan | nan | 137.259 | 119.006 | 1 | 50 | 108 | 190 | 813 |
duration | 4688 | nan | nan | nan | 107.961 | 22.6209 | 7 | 94 | 103 | 118 | 330 |
gross | 4688 | nan | nan | nan | 4.06176e+07 | 6.27544e+07 | 162 | 2.8341e+06 | 1.72871e+07 | 5.18582e+07 | 7.60506e+08 |
director_facebook_likes | 4688 | nan | nan | nan | 696.231 | 2853.72 | 0 | 7 | 47 | 188.25 | 23000 |
num_voted_users | 4688 | nan | nan | nan | 82861.6 | 139306 | 5 | 8428.5 | 32871 | 93011 | 1.68976e+06 |
cast_total_facebook_likes | 4688 | nan | nan | nan | 9607.98 | 18324.4 | 0 | 1378.75 | 3062.5 | 13650.2 | 656730 |
facenumber_in_poster | 4688 | nan | nan | nan | 1.37521 | 2.03111 | 0 | 0 | 1 | 2 | 43 |
num_user_for_reviews | 4688 | nan | nan | nan | 265.88 | 371.625 | 1 | 64 | 153 | 318 | 5060 |
budget | 4688 | nan | nan | nan | 3.67286e+07 | 2.02791e+08 | 218 | 5.43e+06 | 1.7e+07 | 4e+07 | 1.22155e+10 |
imdb_score | 4688 | nan | nan | nan | 6.41086 | 1.11889 | 1.6 | 5.8 | 6.5 | 7.2 | 9.3 |
movie_facebook_likes | 4688 | nan | nan | nan | 7216.21 | 19170.2 | 0 | 0 | 154 | 2000 | 349000 |
end |
# Split the column labels by dtype: numeric columns versus everything else.
categorical = df.select_dtypes(exclude='number').columns
numerical = df.select_dtypes(include='number').columns

# Report both groups as comma-separated lists of column names.
print(f"categorical columns are : {', '.join(map(str, categorical))}")
print(f"numerical columns are : {', '.join(map(str, numerical))}")
:results: categorical columns are : id, color, director_name, actor_2_name, actor_1_name, movie_title, actor_3_name, plot_keywords, language, country, content_rating numerical columns are : Unnamed: 0, actor_3_facebook_likes, actor_1_facebook_likes, title_year, actor_2_facebook_likes, aspect_ratio, Action, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir, History, Horror, Music, Musical, Mystery, News, Romance, Sci-Fi, Short, Sport, Thriller, War, Western, num_critic_for_reviews, duration, gross, director_facebook_likes, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, imdb_score, movie_facebook_likes :end:
categorical columns are : color, director_name, actor_2_name, genres, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link, language, country, content_rating
numerical columns are : num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_1_facebook_likes, gross, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes
Postersprint(os.getcwd()) #os.chdir('./mauviette/notebooks')
:results:
:end:
img_path = '../data/external/posters/' p = r"../data/processed/poster_features.pkl"
:results:
:end:
# Collect the names of the '.jpg' entries found directly inside img_path.
posters = []
with os.scandir(img_path) as files:
    posters.extend(
        entry.name for entry in files if entry.name.endswith('.jpg')
    )
:results:
:end:
Glimpse at a posterimg = load_img(f'{img_path}/{posters[0]}', target_size=(224,224))
img = np.array(img)
print(img.shape)
:results:
:end:
reshaped_img = img.reshape(1,224,224,3) print(reshaped_img.shape)
:results:
:end:
x = preprocess_input(reshaped_img)
:results:
:end:
Clustering — Extracting features
Nous pouvons maintenant charger le modèle VGG16 et retirer manuellement sa couche de sortie. La dernière couche du modèle devient alors une couche entièrement connectée à 4 096 nœuds de sortie : ce vecteur de 4 096 valeurs est le vecteur de caractéristiques que nous utiliserons pour regrouper les images.
# Instantiate VGG16 with its default constructor arguments.
model = VGG16()
# Rebuild the model so that it outputs the activations of the second-to-last
# layer instead of the final one; the accompanying notes describe that output
# as a 4,096-value feature vector.
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
:results:
:end:
features = model.predict(x, use_multiprocessing=True)
:results:
:end:
print(features.shape) features
:results:
array([[1.5461166 , 6.260435 , 2.117587 , ..., 0. , 0.5195822 ,
0.69436365]], dtype=float32)
:end:
def extract_features(file, model):
    """Return the feature vector ``model`` predicts for a single image file.

    The image at ``file`` is loaded at 224x224, turned into a numpy array,
    reshaped into a batch of one, passed through ``preprocess_input`` and
    finally handed to ``model.predict``.
    """
    # Load at 224x224 and convert from PIL.Image.Image to a numpy array.
    pixels = np.array(load_img(file, target_size=(224, 224)))
    # Add the leading sample axis: (num_of_samples, dim 1, dim 2, channels).
    batch = pixels.reshape(1, 224, 224, 3)
    # Prepare the batch for the model, then compute the feature vector.
    return model.predict(preprocess_input(batch), use_multiprocessing=True)
:results:
:end:
# Map every poster file name to the feature vector extracted from it,
# printing a running 1-based count as progress.
data = {}
counter = 0  # stays 0 when there are no posters at all
for counter, poster in enumerate(posters, start=1):
    print(f'{counter} : Extrating features for {poster}')
    feat = extract_features(f'{img_path}/{poster}', model)
    data[poster] = feat
:results:
:end:
filenames = np.array(list(data.keys()))
:results:
:end:
Serializing featureswith open(p,'wb') as file: pickle.dump(data, file)
:results:
:end:
# Reload the features dict from the pickle file at path p.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that this notebook (or another trusted source) produced.
with open(p, 'rb') as file:
    data = pickle.load(file)
:results:
:end:
feat = np.array(list(data.values())) feat.shape
:results:
(4917, 1, 4096)
:end:
feat = feat.reshape(-1,4096) feat.shape
:results:
(4917, 4096)
:end:
Reduce dimensionpca = PCA(n_components=100, random_state=22) pca.fit(feat) x = pca.transform(feat)
:results:
:end:
# Clustering K-Means
# Fit one K-Means model per candidate k (50 through 499) on the reduced
# features and record each model's inertia for the elbow plot.
list_k = list(range(50, 500))
sse = []
for k in list_k:
    km = KMeans(n_clusters=k, random_state=22).fit(x)
    print(f'{k} clusters')
    sse.append(km.inertia_)
:results:
:end:
plt.figure(figsize=(6, 6)) plt.plot(list_k, sse) plt.xlabel(r'Number of clusters *k*') plt.ylabel('Sum of squared distance');
:results:
kmeans = KMeans(n_clusters=200, random_state=22) kmeans.fit(x)
:results:
KMeans(n_clusters=200, random_state=22)
:end:
kmeans.labels_
:results:
array([85, 52, 99, ..., 69, 70, 85], dtype=int32)
:end:
# Bucket the poster file names by the cluster label K-Means assigned to them.
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    groups.setdefault(cluster, []).append(file)
:results:
:end:
# Glimpse at the clusters
def view_cluster(cluster):
    """Plot the posters of one cluster on a 10x10 subplot grid.

    At most 30 posters are drawn; larger clusters are clipped to their
    first 30 files and a message is printed.

    Args:
        cluster: key into the module-level ``groups`` dict.

    Raises:
        KeyError: if ``cluster`` is not a key of ``groups``.
    """
    # One limit drives the check, the message and the slice. The original
    # sliced with [:29] and so showed 29 posters while announcing 30.
    max_images = 30

    plt.figure(figsize=(25, 25))
    # File names assigned to this cluster.
    files = groups[cluster]
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to {max_images}")
        files = files[:max_images]
    # Subplot positions are 1-based, hence start=1.
    for position, file in enumerate(files, start=1):
        plt.subplot(10, 10, position)
        img = np.array(load_img(f'{img_path}/{file}'))
        plt.imshow(img)
        plt.axis('off')
view_cluster(105)
:results:
Bibliography Referencesbibliographystyle:unsrt bibliography:../references/recsys.bib
Local Variables noexportPress p or to see the previous file or, n or to see the next file
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?