Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

no.org 23 KB

You have to be logged in to leave a comment. Sign In

💈 Développez un moteur de recommandation de films

Preamble Emacs Setup noexport

(setq org-src-fontify-natively t) (setq org-latex-image-default-width "5cm") (setq org-image-actual-width nil)

Imports

%matplotlib inline %load_ext autoreload %autoreload 2

import sys import os import warnings warnings.filterwarnings("ignore") import pickle

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns

import tensorflow as tf

from keras.preprocessing.image import load_img from keras.preprocessing.image import img_to_array from keras.applications.vgg16 import preprocess_input

from keras.applications.vgg16 import VGG16 from keras.models import Model

from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.metrics.pairwise import cosine_similarity

from random import randint import pickle

:results:

:end:

Functions

def display_all(df):
    """Display *df* with pandas' row and column display limits raised to 100.

    The limits are applied through ``pd.option_context``, so they only hold
    for the duration of this call.
    """
    limits = ("display.max_rows", 100, "display.max_columns", 100)
    with pd.option_context(*limits):
        display(df)

:results: 1 - e21f656d-1695-428d-8b46-481324f670 :end:

Org noexport

import IPython import tabulate

class OrgFormatter(IPython.core.formatters.BaseFormatter):
    """Formatter declaring 'text/org' as its format type and '_repr_org_' as its print method."""

    # Format type under which this formatter's output is published.
    format_type = IPython.core.formatters.Unicode('text/org')

    # Name of the per-object method associated with this formatter.
    print_method = IPython.core.formatters.ObjectName('_repr_org_')

def pd_dataframe_to_org(df):
    """Return *df* rendered by ``tabulate`` as an 'orgtbl' table.

    Column keys are used as headers and the index is always shown.
    """
    table_options = {
        'headers': 'keys',
        'tablefmt': 'orgtbl',
        'showindex': 'always',
    }
    return tabulate.tabulate(df, **table_options)

# Install an OrgFormatter under the 'text/org' key of the display formatter
# belonging to the shell returned by get_ipython().
ip = get_ipython()
f = OrgFormatter()
ip.display_formatter.formatters['text/org'] = f

# Have that formatter render pandas DataFrames through pd_dataframe_to_org.
# The type is registered by module/class name rather than by class object.
f.for_type_by_name('pandas.core.frame', 'DataFrame', pd_dataframe_to_org)

:results:

:end:

Load Data Load Data

df = pd.read_csv('../data/processed/movie_metadata_processed.csv')

:results:

:end:

Glimpse at the data

display_all(df.describe(include='all').T)

:results:

count unique top freq mean std min 25% 50% 75% max
Unnamed: 0 4688 nan nan nan 2343.5 1353.45 0 1171.75 2343.5 3515.25 4687
id 4688 4688 tt0006864 1 nan nan nan nan nan nan nan
color 4673 2 Color 4477 nan nan nan nan nan nan nan
director_name 4688 2370 Steven Spielberg 26 nan nan nan nan nan nan nan
actor_3_facebook_likes 4670 nan nan nan 620.241 1591.97 0 130 362 632.75 23000
actor_2_name 4678 2923 Morgan Freeman 16 nan nan nan nan nan nan nan
actor_1_facebook_likes 4681 nan nan nan 6548.67 15335.4 0 606 984 11000 640000
actor_1_name 4681 2020 Robert De Niro 47 nan nan nan nan nan nan nan
movie_title 4688 4686 The Host 2 nan nan nan nan nan nan nan
actor_3_name 4670 3389 Steve Coogan 8 nan nan nan nan nan nan nan
plot_keywords 4553 4545 based on novel 4 nan nan nan nan nan nan nan
language 4688 47 English 4376 nan nan nan nan nan nan nan
country 4687 65 USA 3540 nan nan nan nan nan nan nan
content_rating 4688 12 R 2021 nan nan nan nan nan nan nan
title_year 4688 nan nan nan 2002.42 12.4284 1916 1999 2005 2011 2016
actor_2_facebook_likes 4678 nan nan nan 1614.51 4017.5 0 275 592 912 137000
aspect_ratio 4388 nan nan nan 2.12791 0.807937 1.18 1.85 2.35 2.35 16
Action 4688 nan nan nan 0.225043 0.417655 0 0 0 0 1
Adventure 4688 nan nan nan 0.180461 0.384612 0 0 0 0 1
Animation 4688 nan nan nan 0.0492747 0.216464 0 0 0 0 1
Biography 4688 nan nan nan 0.0612201 0.239759 0 0 0 0 1
Comedy 4688 nan nan nan 0.380119 0.485468 0 0 0 1 1
Crime 4688 nan nan nan 0.174275 0.379386 0 0 0 0 1
Documentary 4688 nan nan nan 0.025384 0.157305 0 0 0 0 1
Drama 4688 nan nan nan 0.512159 0.499905 0 0 1 1 1
Family 4688 nan nan nan 0.108575 0.311139 0 0 0 0 1
Fantasy 4688 nan nan nan 0.116254 0.320564 0 0 0 0 1
Film-Noir 4688 nan nan nan 0.00127986 0.0357561 0 0 0 0 1
History 4688 nan nan nan 0.0415956 0.199684 0 0 0 0 1
Horror 4688 nan nan nan 0.108575 0.311139 0 0 0 0 1
Music 4688 nan nan nan 0.0441553 0.205462 0 0 0 0 1
Musical 4688 nan nan nan 0.0273038 0.162984 0 0 0 0 1
Mystery 4688 nan nan nan 0.0949232 0.29314 0 0 0 0 1
News 4688 nan nan nan 0.000639932 0.0252915 0 0 0 0 1
Romance 4688 nan nan nan 0.222056 0.415673 0 0 0 0 1
Sci-Fi 4688 nan nan nan 0.119027 0.323855 0 0 0 0 1
Short 4688 nan nan nan 0.00106655 0.0326442 0 0 0 0 1
Sport 4688 nan nan nan 0.0364761 0.187492 0 0 0 0 1
Thriller 4688 nan nan nan 0.275384 0.446755 0 0 0 1 1
War 4688 nan nan nan 0.043302 0.203558 0 0 0 0 1
Western 4688 nan nan nan 0.019198 0.137235 0 0 0 0 1
num_critic_for_reviews 4688 nan nan nan 137.259 119.006 1 50 108 190 813
duration 4688 nan nan nan 107.961 22.6209 7 94 103 118 330
gross 4688 nan nan nan 4.06176e+07 6.27544e+07 162 2.8341e+06 1.72871e+07 5.18582e+07 7.60506e+08
director_facebook_likes 4688 nan nan nan 696.231 2853.72 0 7 47 188.25 23000
num_voted_users 4688 nan nan nan 82861.6 139306 5 8428.5 32871 93011 1.68976e+06
cast_total_facebook_likes 4688 nan nan nan 9607.98 18324.4 0 1378.75 3062.5 13650.2 656730
facenumber_in_poster 4688 nan nan nan 1.37521 2.03111 0 0 1 2 43
num_user_for_reviews 4688 nan nan nan 265.88 371.625 1 64 153 318 5060
budget 4688 nan nan nan 3.67286e+07 2.02791e+08 218 5.43e+06 1.7e+07 4e+07 1.22155e+10
imdb_score 4688 nan nan nan 6.41086 1.11889 1.6 5.8 6.5 7.2 9.3
movie_facebook_likes 4688 nan nan nan 7216.21 19170.2 0 0 154 2000 349000
:end:

# Split the column labels of df by dtype: numeric columns vs. all the others.
numerical = df.select_dtypes(include='number').columns
categorical = df.select_dtypes(exclude='number').columns

# Report each group as a comma-separated list of column names.
categorical_names = ', '.join(map(str, categorical))
numerical_names = ', '.join(map(str, numerical))
print(f"categorical columns are : {categorical_names}")
print(f"numerical columns are : {numerical_names}")

:results: categorical columns are : id, color, director_name, actor_2_name, actor_1_name, movie_title, actor_3_name, plot_keywords, language, country, content_rating numerical columns are : Unnamed: 0, actor_3_facebook_likes, actor_1_facebook_likes, title_year, actor_2_facebook_likes, aspect_ratio, Action, Adventure, Animation, Biography, Comedy, Crime, Documentary, Drama, Family, Fantasy, Film-Noir, History, Horror, Music, Musical, Mystery, News, Romance, Sci-Fi, Short, Sport, Thriller, War, Western, num_critic_for_reviews, duration, gross, director_facebook_likes, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, imdb_score, movie_facebook_likes :end:

categorical columns are : color, director_name, actor_2_name, genres, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link, language, country, content_rating

numerical columns are : num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_1_facebook_likes, gross, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes

Posters

print(os.getcwd()) #os.chdir('./mauviette/notebooks')

:results:

:end:

img_path = '../data/external/posters/' p = r"../data/processed/poster_features.pkl"

:results:

:end:

# Collect the names of the entries in img_path that end in '.jpg'
# (only the image files are kept).
with os.scandir(img_path) as entries:
    posters = [entry.name for entry in entries if entry.name.endswith('.jpg')]

:results:

:end:

Glimpse at a poster

img = load_img(f'{img_path}/{posters[0]}', target_size=(224,224))

img = np.array(img)

print(img.shape)

:results:

:end:

reshaped_img = img.reshape(1,224,224,3) print(reshaped_img.shape)

:results:

:end:

x = preprocess_input(reshaped_img)

:results:

:end:

Clustering Extracting features

Nous pouvons maintenant charger le modèle VGG et supprimer la couche de sortie manuellement. Cela signifie que la nouvelle couche finale est une couche entièrement connectée avec 4 096 nœuds de sortie. Ce vecteur de 4 096 nombres est le vecteur de caractéristiques que nous utiliserons pour regrouper les images.

model = VGG16()

model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

:results:

:end:

# Run the truncated network on the preprocessed single-image batch.
# `use_multiprocessing` was dropped: it only ever applied to generator /
# Sequence inputs, and the Keras 3 `predict` signature no longer accepts it.
features = model.predict(x)

:results:

:end:

print(features.shape) features

:results:


  array([[1.5461166 , 6.260435  , 2.117587  , ..., 0.        , 0.5195822 ,
  0.69436365]], dtype=float32)

:end:

  • Nous résumons l'ensemble du prototype ci-dessus en une fonction :

def extract_features(file, model):
    """Return the feature vector that *model* predicts for one image file.

    Args:
        file: Path of the image, handed to ``load_img``.
        model: Object whose ``predict`` method is called on the preprocessed
            single-image batch.

    Returns:
        The value returned by ``model.predict`` for that batch.
    """
    # Load the image resized to 224x224.
    img = load_img(file, target_size=(224, 224))
    # Convert from 'PIL.Image.Image' to a numpy array.
    img = np.array(img)
    # Reshape for the model: (num_of_samples, dim 1, dim 2, channels).
    reshaped_img = img.reshape(1, 224, 224, 3)
    # Prepare the batch for the model.
    imgx = preprocess_input(reshaped_img)
    # `use_multiprocessing` is intentionally not passed: it only applied to
    # generator / Sequence inputs and is rejected by the Keras 3 signature.
    return model.predict(imgx)

:results:

:end:

  • Nous appliquons cette fonction à toutes nos affiches :

# Map each poster file name to the feature vector extracted from that poster.
data = {}

# enumerate(start=1) supplies the 1-based progress counter that was
# previously incremented by hand.
for counter, poster in enumerate(posters, start=1):
    print(f'{counter} : Extracting features for {poster}')
    data[poster] = extract_features(f'{img_path}/{poster}', model)

:results:

:end:

filenames = np.array(list(data.keys()))

:results:

:end:

Serializing features

with open(p,'wb') as file: pickle.dump(data, file)

:results:

:end:

with open(p, 'rb') as file: data = pickle.load(file)

:results:

:end:

  • get a list of just the features

feat = np.array(list(data.values())) feat.shape

:results:

(4917, 1, 4096)

:end:

feat = feat.reshape(-1,4096) feat.shape

:results:

(4917, 4096)

:end:

Reduce dimension

pca = PCA(n_components=100, random_state=22) pca.fit(feat) x = pca.transform(feat)

:results:

:end:

Clustering K-Means

# Candidate cluster counts (50 to 499) and the inertia obtained for each one;
# both lists are consumed by the elbow plot.
list_k = list(range(50, 500))
sse = []

for k in list_k:
    # fit() returns the fitted estimator, so construction and fitting chain.
    km = KMeans(n_clusters=k, random_state=22).fit(x)
    print(f'{k} clusters')
    sse.append(km.inertia_)

:results:

:end:

plt.figure(figsize=(6, 6)) plt.plot(list_k, sse) plt.xlabel(r'Number of clusters *k*') plt.ylabel('Sum of squared distance');

:results:

./obipy-resources/jdS27K.png :end:

kmeans = KMeans(n_clusters=200, random_state=22) kmeans.fit(x)

:results:

KMeans(n_clusters=200, random_state=22)

:end:

kmeans.labels_

:results:

array([85, 52, 99, ..., 69, 70, 85], dtype=int32)

:end:

# Bucket the poster file names by the K-Means label assigned to each of them.
groups = {}
for file, cluster in zip(filenames, kmeans.labels_):
    # setdefault creates the bucket the first time a label is seen, so one
    # append covers both the "new cluster" and "known cluster" cases.
    groups.setdefault(cluster, []).append(file)

:results:

:end:

Glimpse at the clusters

def view_cluster(cluster):
    """Plot up to 30 posters of one cluster on a 25x25 figure.

    Args:
        cluster: Key into the module-level ``groups`` mapping.

    Raises:
        KeyError: If *cluster* is not a key of ``groups``.
    """
    plt.figure(figsize=(25, 25))
    # File names assigned to this cluster.
    files = groups[cluster]
    # Only allow up to 30 images to be shown at a time.
    if len(files) > 30:
        print(f"Clipping cluster size from {len(files)} to 30")
        # Was files[:29], which kept 29 images despite the announced 30.
        files = files[:30]
    # Plot each image of the cluster in its own cell of a 10x10 grid.
    for index, file in enumerate(files):
        plt.subplot(10, 10, index + 1)
        img = load_img(f'{img_path}/{file}')
        img = np.array(img)
        plt.imshow(img)
        plt.axis('off')

view_cluster(105)

:results:

./obipy-resources/0ll00e.png :end:

Bibliography References

bibliographystyle:unsrt bibliography:../references/recsys.bib

Local Variables noexport
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...