💈 Développez un moteur de recommandation de films

%matplotlib inline %load_ext autoreload %autoreload 2

import sys import os import warnings warnings.filterwarnings("ignore") import pickle

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns

from keras.preprocessing.image import load_img from keras.preprocessing.image import img_to_array from keras.applications.vgg16 import preprocess_input

from keras.applications.vgg16 import VGG16 from keras.models import Model

from sklearn.cluster import KMeans from sklearn.decomposition import PCA

from random import randint import pickle




def display_all(df, *, max_rows=100, max_columns=100):
    """Display *df* with pandas' row/column truncation limits raised.

    The display options are changed only for the duration of the call
    (through ``pd.option_context``) and are restored afterwards.

    Args:
        df: Object to show, typically a DataFrame or Series.
        max_rows: Value used for ``display.max_rows`` while displaying.
        max_columns: Value used for ``display.max_columns`` while displaying.

    Returns:
        None. The object is rendered as a side effect.
    """
    try:
        # `display` is provided by the IPython/Jupyter runtime, not imported.
        show = display  # noqa: F821
    except NameError:
        # Outside a notebook there is no `display`; fall back to plain text.
        show = print

    with pd.option_context(
        "display.max_rows", max_rows,
        "display.max_columns", max_columns,
    ):
        show(df)



The Data

Load Data

# Load the movie metadata into a DataFrame.
# NOTE(review): the path literal ends with '/', i.e. it names a directory
# rather than a CSV file -- confirm the intended file name.
df = pd.read_csv(filepath_or_buffer='/Users/lss/Sites/')

Glimpse at the data



count unique top freq mean std min 25% 50% 75% max
color 5024 2 Color 4815 nan nan nan nan nan nan nan
director_name 4939 2398 Steven Spielberg 26 nan nan nan nan nan nan nan
num_critic_for_reviews 4993 nan nan nan 140.194 121.602 1 50 110 195 813
duration 5028 nan nan nan 107.201 25.1974 7 93 103 118 511
director_facebook_likes 4939 nan nan nan 686.509 2813.33 0 7 49 194.5 23000
actor_3_facebook_likes 5020 nan nan nan 645.01 1665.04 0 133 371.5 636 23000
actor_2_name 5030 3032 Morgan Freeman 20 nan nan nan nan nan nan nan
actor_1_facebook_likes 5036 nan nan nan 6560.05 15020.8 0 614 988 11000 640000
gross 4159 nan nan nan 4.84684e+07 6.8453e+07 162 5.34099e+06 2.55175e+07 6.23094e+07 7.60506e+08
genres 5043 914 Drama 236 nan nan nan nan nan nan nan
actor_1_name 5036 2097 Robert De Niro 49 nan nan nan nan nan nan nan
movie_title 5043 4917 Ben-Hur 3 nan nan nan nan nan nan nan
num_voted_users 5043 nan nan nan 83668.2 138485 5 8593.5 34359 96309 1.68976e+06
cast_total_facebook_likes 5043 nan nan nan 9699.06 18163.8 0 1411 3090 13756.5 656730
actor_3_name 5020 3521 John Heard 8 nan nan nan nan nan nan nan
facenumber_in_poster 5030 nan nan nan 1.37117 2.01358 0 0 1 2 43
plot_keywords 4890 4760 based on novel 4 nan nan nan nan nan nan nan
movie_imdb_link 5043 4919 3 nan nan nan nan nan nan nan
num_user_for_reviews 5022 nan nan nan 272.771 377.983 1 65 156 326 5060
language 5031 47 English 4704 nan nan nan nan nan nan nan
country 5038 65 USA 3807 nan nan nan nan nan nan nan
content_rating 4740 18 R 2118 nan nan nan nan nan nan nan
budget 4551 nan nan nan 3.97526e+07 2.06115e+08 218 6e+06 2e+07 4.5e+07 1.22155e+10
title_year 4935 nan nan nan 2002.47 12.4746 1916 1999 2005 2011 2016
actor_2_facebook_likes 5030 nan nan nan 1651.75 4042.44 0 281 595 918 137000
imdb_score 5043 nan nan nan 6.44214 1.12512 1.6 5.8 6.6 7.2 9.5
aspect_ratio 4714 nan nan nan 2.2204 1.38511 1.18 1.85 2.35 2.35 16
movie_facebook_likes 5043 nan nan nan 7525.96 19320.4 0 0 166 3000 349000

# Split the columns of df_raw by dtype: non-numeric on one side, numeric on
# the other.
# NOTE(review): `df_raw` is not assigned in the visible part of this
# notebook (the load cell binds `df`) -- confirm where it is defined.
categorical = df_raw.select_dtypes(exclude='number').columns
numerical = df_raw.select_dtypes(include='number').columns

# Report each group as a comma-separated list of column names.
print(f"categorical columns are : {', '.join(map(str, categorical))}")
print(f"numerical columns are : {', '.join(map(str, numerical))}")

categorical columns are : color, director_name, actor_2_name, genres, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link, language, country, content_rating

numerical columns are : num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_1_facebook_likes, gross, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes

# Directory that is scanned for poster images.
path = "/Users/lss//Sites/"

# Collected paths of the poster image files found directly inside `path`.
posters = []

with os.scandir(path) as files:
    # Loop through each entry in the directory.
    for file in files:
        # Keep only the .jpg files.
        # NOTE(review): this condition and the appended value were truncated
        # in the source and have been reconstructed. The full entry path is
        # stored so that loading posters[0] later does not depend on the
        # current working directory -- confirm against the original notebook.
        if file.name.endswith('.jpg'):
            posters.append(file.path)



# Load the first collected poster, resized to 224x224 via target_size, and
# convert it to a NumPy array.
img = np.array(load_img(posters[0], target_size=(224, 224)))

# Add a leading batch axis: one sample of 224x224 with 3 channels.
reshaped_img = img.reshape((1, 224, 224, 3))
print(reshaped_img.shape)

# Apply the VGG16 preprocessing function to the batched image.
x = preprocess_input(reshaped_img)



The Model Architecture

Nous pouvons maintenant charger le modèle VGG et supprimer la couche de sortie manuellement. Cela signifie que la nouvelle couche finale est une couche entièrement connectée avec 4 096 nœuds de sortie. Ce vecteur de 4 096 nombres est le vecteur de caractéristiques que nous utiliserons pour regrouper les images.

# Instantiate VGG16 with its default arguments.
model = VGG16()

# Rebuild the model so that it ends at the second-to-last layer; its output
# becomes the feature vector (the accompanying notebook text describes it as
# a fully connected layer with 4,096 outputs).
penultimate = model.layers[-2]
model = Model(inputs=model.inputs, outputs=penultimate.output)



# Run the preprocessed image batch through the truncated network.
# `use_multiprocessing=True` has been removed: in TF2-era Keras it only
# applies to generator / keras.utils.Sequence inputs (x is a NumPy array
# here), and Keras 3 no longer accepts the argument in predict() at all.
features = model.predict(x)

print(features.shape)
# Bare expression so the notebook renders the feature array as cell output.
features


  array([[0.4690976, 0.       , 0.       , ..., 0.       , 0.       ,
  3.0501497]], dtype=float32)


