Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

TRN_recsys.org 16 KB

You have to be logged in to leave a comment. Sign In

💈 Développez un moteur de recommandation de films

Preamble Emacs Setup noexport

;; Fontify source blocks natively in Org buffers.
(setq org-src-fontify-natively t)

(setq lsp-semantic-tokens-enable t)
(setq lsp-enable-symbol-highlighting t)

;; General lsp-mode settings.
(setq lsp-enable-file-watchers nil
      read-process-output-max (* 1024 1024)
      gc-cons-threshold 100000000
      lsp-idle-delay 0.5
      ;; NOTE(review): the flattened text read ";; lsp-eldoc-hook nil"; it is
      ;; kept as a disabled setting -- confirm it was not meant to be active.
      ;; lsp-eldoc-hook nil
      lsp-eldoc-enable-hover nil

      ;; no breadcrumb in the header line
      lsp-headerline-breadcrumb-enable nil
      ;; no imenu, see menu-list
      lsp-enable-imenu nil
      ;; code lens
      lsp-lens-enable t

      lsp-semantic-highlighting t
      lsp-modeline-code-actions-enable t)

;; Completion.
(setq lsp-completion-provider :company
      lsp-completion-show-detail t
      lsp-completion-show-kind t)

;; lsp-ui.
(setq lsp-ui-doc-enable t
      lsp-ui-doc-show-with-mouse nil
      lsp-ui-doc-show-with-cursor t
      lsp-ui-doc-use-childframe t

      lsp-ui-sideline-diagnostic-max-line-length 80

      ;; lsp-ui-imenu
      lsp-ui-imenu-enable nil
      ;; lsp-ui-peek
      lsp-ui-peek-enable t
      ;; lsp-ui-sideline
      lsp-ui-sideline-enable t
      lsp-ui-sideline-ignore-duplicate t
      lsp-ui-sideline-show-symbol t
      lsp-ui-sideline-show-hover t
      lsp-ui-sideline-show-diagnostics t
      lsp-ui-sideline-show-code-actions t)

;; Diagnostics and signature help.
(setq lsp-diagnostics-provider :none
      lsp-modeline-diagnostics-enable nil
      lsp-signature-auto-activate nil ;; you could manually request them via `lsp-signature-activate`
      lsp-signature-render-documentation nil)

Imports

%matplotlib inline %load_ext autoreload %autoreload 2

import sys import os import warnings warnings.filterwarnings("ignore") import pickle

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns

from keras.preprocessing.image import load_img from keras.preprocessing.image import img_to_array from keras.applications.vgg16 import preprocess_input

from keras.applications.vgg16 import VGG16 from keras.models import Model

from sklearn.cluster import KMeans from sklearn.decomposition import PCA

from random import randint import pickle

:results:

:end:

Functions

def display_all(df, max_rows=100, max_columns=100):
    """Display ``df`` with pandas' row/column display limits overridden.

    The limits are set through ``pd.option_context``, so they apply only
    while ``df`` is being displayed and the previous option values are
    restored when the call returns.

    Args:
        df: Object to display (typically a DataFrame).
        max_rows: Value used for ``display.max_rows`` during the call.
        max_columns: Value used for ``display.max_columns`` during the call.
    """
    with pd.option_context(
        "display.max_rows", max_rows,
        "display.max_columns", max_columns,
    ):
        # ``display`` is not imported in this file; it is expected to be the
        # rich-display function made available by the IPython session.
        display(df)

:results:

:end:

Org noexport

import IPython import tabulate

class OrgFormatter(IPython.core.formatters.BaseFormatter):
    """IPython display formatter for the ``text/org`` format type.

    Declares ``_repr_org_`` as the print-method name associated with
    this formatter.
    """

    # Format type this formatter produces.
    format_type = IPython.core.formatters.Unicode("text/org")
    # Name of the method associated with this format on displayed objects.
    print_method = IPython.core.formatters.ObjectName("_repr_org_")

def pd_dataframe_to_org(df):
    """Return ``df`` rendered by ``tabulate`` in its ``orgtbl`` table format.

    Headers are taken from the frame's keys and the index is always shown.
    """
    org_options = {
        "headers": "keys",
        "tablefmt": "orgtbl",
        "showindex": "always",
    }
    return tabulate.tabulate(df, **org_options)

# Install an OrgFormatter on the running IPython shell under 'text/org'.
ip = get_ipython()
f = OrgFormatter()
ip.display_formatter.formatters['text/org'] = f

# Register the DataFrame renderer by module/class name.
f.for_type_by_name('pandas.core.frame', 'DataFrame', pd_dataframe_to_org)

:results:

:end:

The Data Load Data

# Load the prepared movie metadata CSV into ``df``.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
df = pd.read_csv('/Users/lss/Sites/simplon.ai/briefs/recsys/data/movie_metadata_prepared.csv')

:results: 0 - 06c3c7bc-1cc7-4984-a890-a11f3ea59b :end:

Glimpse at the data

# Summary statistics for every column (include='all'), transposed so that
# each column of ``df`` becomes one row of the displayed table.
display_all(df.describe(include='all').T)

:results:

count unique top freq mean std min 25% 50% 75% max
color 5024 2 Color 4815 nan nan nan nan nan nan nan
director_name 4939 2398 Steven Spielberg 26 nan nan nan nan nan nan nan
num_critic_for_reviews 4993 nan nan nan 140.194 121.602 1 50 110 195 813
duration 5028 nan nan nan 107.201 25.1974 7 93 103 118 511
director_facebook_likes 4939 nan nan nan 686.509 2813.33 0 7 49 194.5 23000
actor_3_facebook_likes 5020 nan nan nan 645.01 1665.04 0 133 371.5 636 23000
actor_2_name 5030 3032 Morgan Freeman 20 nan nan nan nan nan nan nan
actor_1_facebook_likes 5036 nan nan nan 6560.05 15020.8 0 614 988 11000 640000
gross 4159 nan nan nan 4.84684e+07 6.8453e+07 162 5.34099e+06 2.55175e+07 6.23094e+07 7.60506e+08
genres 5043 914 Drama 236 nan nan nan nan nan nan nan
actor_1_name 5036 2097 Robert De Niro 49 nan nan nan nan nan nan nan
movie_title 5043 4917 Ben-Hur 3 nan nan nan nan nan nan nan
num_voted_users 5043 nan nan nan 83668.2 138485 5 8593.5 34359 96309 1.68976e+06
cast_total_facebook_likes 5043 nan nan nan 9699.06 18163.8 0 1411 3090 13756.5 656730
actor_3_name 5020 3521 John Heard 8 nan nan nan nan nan nan nan
facenumber_in_poster 5030 nan nan nan 1.37117 2.01358 0 0 1 2 43
plot_keywords 4890 4760 based on novel 4 nan nan nan nan nan nan nan
movie_imdb_link 5043 4919 http://www.imdb.com/title/tt0232500/?ref_=fn_tt_tt_1 3 nan nan nan nan nan nan nan
num_user_for_reviews 5022 nan nan nan 272.771 377.983 1 65 156 326 5060
language 5031 47 English 4704 nan nan nan nan nan nan nan
country 5038 65 USA 3807 nan nan nan nan nan nan nan
content_rating 4740 18 R 2118 nan nan nan nan nan nan nan
budget 4551 nan nan nan 3.97526e+07 2.06115e+08 218 6e+06 2e+07 4.5e+07 1.22155e+10
title_year 4935 nan nan nan 2002.47 12.4746 1916 1999 2005 2011 2016
actor_2_facebook_likes 5030 nan nan nan 1651.75 4042.44 0 281 595 918 137000
imdb_score 5043 nan nan nan 6.44214 1.12512 1.6 5.8 6.6 7.2 9.5
aspect_ratio 4714 nan nan nan 2.2204 1.38511 1.18 1.85 2.35 2.35 16
movie_facebook_likes 5043 nan nan nan 7525.96 19320.4 0 0 166 3000 349000
:end:

# Split the column labels of ``df`` (the frame read from the CSV) by dtype:
# numeric columns on one side, everything else on the other.
numerical = df.select_dtypes(include='number').columns
categorical = df.select_dtypes(exclude='number').columns

print(f"categorical columns are : {', '.join(str(x) for x in categorical)}")
print(f"numerical columns are : {', '.join(str(x) for x in numerical)}")

categorical columns are : color, director_name, actor_2_name, genres, actor_1_name, movie_title, actor_3_name, plot_keywords, movie_imdb_link, language, country, content_rating

numerical columns are : num_critic_for_reviews, duration, director_facebook_likes, actor_3_facebook_likes, actor_1_facebook_likes, gross, num_voted_users, cast_total_facebook_likes, facenumber_in_poster, num_user_for_reviews, budget, title_year, actor_2_facebook_likes, imdb_score, aspect_ratio, movie_facebook_likes

# Directory containing the movie poster images.
path = "/Users/lss//Sites/simplon.ai/briefs/recsys/data/posters"

# Make the poster directory the working directory so the bare file names
# collected below can be opened directly.
os.chdir(path)

# Keep only the names of the .jpg entries found in the poster directory.
# NOTE: os.scandir yields entries in arbitrary order, so posters[0] is not
# guaranteed to be the same file on every machine.
with os.scandir(path) as files:
    posters = [file.name for file in files if file.name.endswith('.jpg')]

:results:

:end:

# Load the first poster resized to 224x224 and turn it into a NumPy array.
img = np.array(load_img(posters[0], target_size=(224, 224)))

print(img.shape)

:results:

:end:

# Add a leading axis of length 1: the array becomes (1, 224, 224, 3).
reshaped_img = img.reshape((1, 224, 224, 3))
print(reshaped_img.shape)

:results:

:end:

Preprocessing

# Apply the VGG16 input preprocessing (keras.applications.vgg16.preprocess_input)
# to the reshaped image array.
x = preprocess_input(reshaped_img)

:results:

:end:

The Model Architecture

Nous pouvons maintenant charger le modèle VGG et supprimer la couche de sortie manuellement. Cela signifie que la nouvelle couche finale est une couche entièrement connectée avec 4 096 nœuds de sortie. Ce vecteur de 4 096 nombres est le vecteur de caractéristiques que nous utiliserons pour regrouper les images.

# Instantiate VGG16 with its default arguments.
base_model = VGG16()

# Feature extractor: same inputs, but the output is taken from the
# second-to-last layer instead of the final one.
model = Model(inputs=base_model.inputs, outputs=base_model.layers[-2].output)

:results:

:end:

# Run the truncated model on the preprocessed image to get its feature vector.
# ``use_multiprocessing`` is dropped: it only applies to generator / Sequence
# inputs (``x`` is an in-memory array) and newer Keras releases reject it.
features = model.predict(x)

:results:

:end:

print(features.shape)
# Bare expression: shown as the cell result when run in a notebook.
features

:results:


  array([[0.4690976, 0.       , 0.       , ..., 0.       , 0.       ,
  3.0501497]], dtype=float32)

:end:

Bibliography References

bibliographystyle:unsrt bibliography:recsys.bib

Local Variables noexport
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...