#importing necessary libraries
import pandas as pd
import numpy as np

%matplotlib inline

# Load my data and the size of it
df = pd.read_csv('tmdb-movies.csv')

df.shape   #(10866, 21)

(10866, 21)

df.head(10)

#getting more info about columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10866 non-null  int64  
 1   imdb_id               10856 non-null  object 
 2   popularity            10866 non-null  float64
 3   budget                10866 non-null  int64  
 4   revenue               10866 non-null  int64  
 5   original_title        10866 non-null  object 
 6   cast                  10790 non-null  object 
 7   homepage              2936 non-null   object 
 8   director              10822 non-null  object 
 9   tagline               8042 non-null   object 
 10  keywords              9373 non-null   object 
 11  overview              10862 non-null  object 
 12  runtime               10866 non-null  int64  
 13  genres                10843 non-null  object 
 14  production_companies  9836 non-null   object 
 15  release_date          10866 non-null  object 
 16  vote_count            10866 non-null  int64  
 17  vote_average          10866 non-null  float64
 18  release_year          10866 non-null  int64  
 19  budget_adj            10866 non-null  float64
 20  revenue_adj           10866 non-null  float64
dtypes: float64(4), int64(6), object(11)
memory usage: 1.7+ MB

# After discussing the structure of the data and any problems that need to be cleaned

#I started by removing unnecessary columnss for my investigation
#to optimize memory usage
df.drop(['imdb_id', 'release_date', 'original_title', 'cast', 'homepage', 'tagline', 'keywords', 'overview', 'production_companies' , 'revenue_adj' , 'budget_adj'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10866 non-null  int64  
 1   popularity    10866 non-null  float64
 2   budget        10866 non-null  int64  
 3   revenue       10866 non-null  int64  
 4   director      10822 non-null  object 
 5   runtime       10866 non-null  int64  
 6   genres        10843 non-null  object 
 7   vote_count    10866 non-null  int64  
 8   vote_average  10866 non-null  float64
 9   release_year  10866 non-null  int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 849.0+ KB

#removing duplicates
sum(df.duplicated())
df.drop_duplicates(inplace=True)

#dropping null values rows for director & genres columns
#as genre & directors are central to some questions.dropping those rows may be best since they will affect my results
df.dropna(inplace=True) 

print(df['director'].isnull().sum())    #to make sure no null values are left
print(df['genres'].isnull().sum())      #to make sure no null values are left

df.info()

0
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10800 entries, 0 to 10865
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            10800 non-null  int64  
 1   popularity    10800 non-null  float64
 2   budget        10800 non-null  int64  
 3   revenue       10800 non-null  int64  
 4   director      10800 non-null  object 
 5   runtime       10800 non-null  int64  
 6   genres        10800 non-null  object 
 7   vote_count    10800 non-null  int64  
 8   vote_average  10800 non-null  float64
 9   release_year  10800 non-null  int64  
dtypes: float64(2), int64(6), object(2)
memory usage: 928.1+ KB

 # checking if budget col has any 0 values
df[df['budget']==0].shape

(5636, 10)

#getting budget mean for non-zero values
budget_mean = df['budget'][df['budget'] > 0].mean()
budget_mean

30766902.224825718

#replacing each 0 value with the mean
df['budget'].replace(0, budget_mean, inplace=True)

#checking if non-zero values were replaced
df[df['budget']==0].shape

(0, 10)

 # checking if revenue col has any 0 values
df[df['revenue']==0].shape

(5952, 10)

#getting revenue mean for non-zero values
rev_mean =df['revenue'][df['revenue'] > 0].mean()
rev_mean

89254997.08642739

#replacing each 0 value with the mean
df['revenue'].replace(0, rev_mean, inplace=True)
df[df['revenue']==0].shape

(0, 10)

 # checking if runtime col has any 0 values
df[df['runtime']==0].shape

#getting revenue mean for non-zero values
runtime_mean =df['runtime'][df['runtime'] > 0].mean()

#replacing each 0 value with the mean
df['runtime'].replace(0, runtime_mean , inplace=True)

 #creating a function that prints out the max & min values of a column
def Min_Max(a):
    print("Minimum Value:" , a.min())
    print("Maximum Value:", a.max())

#creating a functions that prints out the floats & ints ranges
def floats_ranges():
        print("float16: min:" , np.finfo("float16").min , "| max: ", np.finfo("float16").max )
        print("float32: min:" , np.finfo("float32").min , "| max: ", np.finfo("float32").max )
        print("float64: min:" , np.finfo("float64").min , "| max: ", np.finfo("float64").max )
       
    
def ints_ranges():
        print(np.iinfo("int8"))
        print(np.iinfo("int16"))
        print(np.iinfo("int32"))
        print(np.iinfo("int64"))

#checking if the data type int64 suits the range of the value
Min_Max(df['revenue'])

#no need to be changed since its type suits the range

Minimum Value: 2.0
Maximum Value: 2781505847.0

#checking if the data type float64 suits the range of the value
Min_Max(df['popularity'])

Minimum Value: 0.000188
Maximum Value: 32.985763

#checking for a suitable data type range
floats_ranges()

float16: min: -65500.0 | max:  65500.0
float32: min: -3.4028235e+38 | max:  3.4028235e+38
float64: min: -1.7976931348623157e+308 | max:  1.7976931348623157e+308

# it appears that float16 is the most suitable
#changing its data type for memory optimization
df['popularity'] = df['popularity'].astype('float16')

#checking if the data type int64 suits the range of the value
Min_Max(df['id'])

#checking for a better data type range
ints_ranges()

#changing its data type for memory optimization
df['id'] = df['id'].astype('int32')

Minimum Value: 5
Maximum Value: 417859
Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

#checking if the data type int64 suits the range of the value
Min_Max(df['budget'])
#checking for a better data type range
ints_ranges()
#changing its data type for memory optimization
df['budget'] = df['budget'].astype('int32')

Minimum Value: 1.0
Maximum Value: 425000000.0
Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

#check for unique values to see if its possible to convert to int (there are no floats)
df['runtime'].unique()

#checking if the data type int64 suits the range of the value
Min_Max(df['runtime'])

#changing its data type for memory optimization
ints_ranges()

#changing for a better data type range
df['runtime'] = df['runtime'].astype('int16')

Minimum Value: 2.0
Maximum Value: 900.0
Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

#since genres column has multiple values splitted by '|'
# i decided to split them & save genres in an array
df['genres'] = df['genres'].str.split('|')
df['genres']

0        [Action, Adventure, Science Fiction, Thriller]
1        [Action, Adventure, Science Fiction, Thriller]
2                [Adventure, Science Fiction, Thriller]
3         [Action, Adventure, Science Fiction, Fantasy]
4                             [Action, Crime, Thriller]
                              ...                      
10861                                     [Documentary]
10862                        [Action, Adventure, Drama]
10863                                 [Mystery, Comedy]
10864                                  [Action, Comedy]
10865                                          [Horror]
Name: genres, Length: 10800, dtype: object

#checking if its possible to change their data type from object to category
print(df['director'].nunique())
print(df['release_year'].nunique())

#better to convert it to category to save memory
df['director'] = df['director'].astype('category')
df['release_year'] = df['release_year'].astype('category')

5056
56

#checking for a better data type range
Min_Max(df['vote_count'])

#checking if the data type int64 suits the range of the value
ints_ranges()

#changing its data type for memory optimization
df['vote_count'] = df['vote_count'].astype('int16')

Minimum Value: 10
Maximum Value: 9767
Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

#checking for a better data type range
Min_Max(df['vote_average'])

#checking if the data type int64 suits the range of the value
floats_ranges()

#changing its data type for memory optimization
df['vote_average'] = df['vote_average'].astype('float16')

Minimum Value: 1.5
Maximum Value: 9.2
float16: min: -65500.0 | max:  65500.0
float32: min: -3.4028235e+38 | max:  3.4028235e+38
float64: min: -1.7976931348623157e+308 | max:  1.7976931348623157e+308

#to make sure we dont have to deal with zero or null values
for c in df.columns:
    print(df[df[c]==0].shape , df[df[c]==0].isnull() , c)

(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] id
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] popularity
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] budget
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] revenue
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] director
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] runtime
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] genres
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] vote_count
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] vote_average
(0, 10) Empty DataFrame
Columns: [id, popularity, budget, revenue, director, runtime, genres, vote_count, vote_average, release_year]
Index: [] release_year

#data set after cleaned and memory optimized
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10800 entries, 0 to 10865
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   id            10800 non-null  int32   
 1   popularity    10800 non-null  float16 
 2   budget        10800 non-null  int32   
 3   revenue       10800 non-null  float64 
 4   director      10800 non-null  category
 5   runtime       10800 non-null  int16   
 6   genres        10800 non-null  object  
 7   vote_count    10800 non-null  int16   
 8   vote_average  10800 non-null  float16 
 9   release_year  10800 non-null  category
dtypes: category(2), float16(2), float64(1), int16(2), int32(2), object(1)
memory usage: 624.5+ KB

df.head(100)

df.plot(y='runtime' , x='popularity' , kind="scatter");

df.plot(y='runtime' , x='revenue', kind="scatter" );

#if we further look at runtime column corrolations with other columns
pd.plotting.scatter_matrix(df.iloc[:, 1:] , figsize=(15,15));
# i used df.iloc[:, 1:] to draw the matrix without the id col

# assuming the high revenue movies are the ones higher than 50% of the movies
high_revenue = df['revenue'].quantile(0.5)

# getting the hight 50% rvenue movies
high_revenue_df = df[df['revenue'] >= high_revenue]

# counting directors with high revenue movies
director_counts = high_revenue_df['director'].value_counts()

director_counts.head()

Woody Allen         26
Steven Spielberg    25
Clint Eastwood      19
Martin Scorsese     19
Ron Howard          16
Name: director, dtype: int64

#getting top 15 direcots
top_directors = director_counts.head(15)

# plotting high revenue movies Vs their directors
top_directors.plot(figsize=(5,5), title="Top 15 Directors with the Most High-Revenue Movies" , xlabel="Director", ylabel="Number of High Revenue Movies" , kind='bar', color='blue');

#lets get most popular movies
most_pop = df['popularity'].quantile(0.5)

# getting the hight 50% rvenue movies
most_pop_df = df[df['popularity'] >= most_pop]

# counting most popular movies according to their release year
years_count = most_pop_df['release_year'].value_counts()

years_count

2014    339
2015    320
2013    298
2011    292
2009    264
2012    261
2010    253
2008    245
2007    233
2006    210
2005    194
2004    174
2003    150
2002    149
2001    133
1996    121
1998    120
1997    115
1995    113
2000    110
1999    108
1994    105
1993    103
1992     72
1990     66
1989     62
1986     62
1988     58
1985     58
1991     57
1987     55
1984     49
1982     39
1983     37
1980     30
1981     28
1979     28
1978     28
1973     25
1977     22
1971     20
1975     19
1968     18
1976     18
1974     16
1963     14
1967     14
1969     13
1972     12
1962     12
1966     12
1960     11
1970     11
1964     10
1961      9
1965      7
Name: release_year, dtype: int64

#plotting a graph for most 15 popular movies according to their release year
years_count.head(15).plot(figsize=(5,5), title="Top 15 most Popular movies with their realease year" , xlabel="Release Year", ylabel="Popularity" , kind='bar', color='green');

#we already got the high revenue movies in the variable high_revenue_df
#now i wanna get their categories at first

#genres are saved in series. i want to explode better for investigation
high_revenue_df = high_revenue_df.assign(genres=high_revenue_df['genres']).explode('genres') #chatgpt

#getting genres with high revenue movies
top_genres = high_revenue_df['genres'].value_counts()

top_genres

Drama              2941
Comedy             2500
Thriller           1873
Action             1606
Horror             1176
Romance            1041
Adventure          1031
Family              955
Science Fiction     865
Crime               769
Fantasy             672
Animation           587
Mystery             516
Documentary         410
Music               262
History             216
War                 181
TV Movie            161
Foreign             155
Western             118
Name: genres, dtype: int64

#plotting graph
top_genres.head(15).plot(figsize=(5,5), title="Top 15 Genres with the Most High-Revenue Movies" , xlabel="Genre", ylabel="Number of High Revenue Movies" , kind='bar', color='red');

#since we have a high revenue df
#let me draw a matrix to view its corrolations better

pd.plotting.scatter_matrix(high_revenue_df.iloc[: , 1:] , figsize=(10,10));

# Running this cell will execute a bash command to convert this notebook to an .html file
!python -m nbconvert --to html Investigate_a_Dataset.ipynb

[NbConvertApp] Converting notebook Investigate_a_Dataset.ipynb to html
[NbConvertApp] WARNING | Alternative text is missing on 7 image(s).
[NbConvertApp] Writing 1347661 bytes to Investigate_a_Dataset.html

	id	imdb_id	popularity	budget	revenue	original_title	cast	homepage	director	tagline	...	overview	runtime	genres	production_companies	release_date	vote_count	vote_average	release_year	budget_adj	revenue_adj
0	135397	tt0369610	32.985763	150000000	1513528810	Jurassic World	Chris Pratt\|Bryce Dallas Howard\|Irrfan Khan\|Vi...	http://www.jurassicworld.com/	Colin Trevorrow	The park is open.	...	Twenty-two years after the events of Jurassic ...	124	Action\|Adventure\|Science Fiction\|Thriller	Universal Studios\|Amblin Entertainment\|Legenda...	6/9/15	5562	6.5	2015	1.379999e+08	1.392446e+09
1	76341	tt1392190	28.419936	150000000	378436354	Mad Max: Fury Road	Tom Hardy\|Charlize Theron\|Hugh Keays-Byrne\|Nic...	http://www.madmaxmovie.com/	George Miller	What a Lovely Day.	...	An apocalyptic story set in the furthest reach...	120	Action\|Adventure\|Science Fiction\|Thriller	Village Roadshow Pictures\|Kennedy Miller Produ...	5/13/15	6185	7.1	2015	1.379999e+08	3.481613e+08
2	262500	tt2908446	13.112507	110000000	295238201	Insurgent	Shailene Woodley\|Theo James\|Kate Winslet\|Ansel...	http://www.thedivergentseries.movie/#insurgent	Robert Schwentke	One Choice Can Destroy You	...	Beatrice Prior must confront her inner demons ...	119	Adventure\|Science Fiction\|Thriller	Summit Entertainment\|Mandeville Films\|Red Wago...	3/18/15	2480	6.3	2015	1.012000e+08	2.716190e+08
3	140607	tt2488496	11.173104	200000000	2068178225	Star Wars: The Force Awakens	Harrison Ford\|Mark Hamill\|Carrie Fisher\|Adam D...	http://www.starwars.com/films/star-wars-episod...	J.J. Abrams	Every generation has a story.	...	Thirty years after defeating the Galactic Empi...	136	Action\|Adventure\|Science Fiction\|Fantasy	Lucasfilm\|Truenorth Productions\|Bad Robot	12/15/15	5292	7.5	2015	1.839999e+08	1.902723e+09
4	168259	tt2820852	9.335014	190000000	1506249360	Furious 7	Vin Diesel\|Paul Walker\|Jason Statham\|Michelle ...	http://www.furious7.com/	James Wan	Vengeance Hits Home	...	Deckard Shaw seeks revenge against Dominic Tor...	137	Action\|Crime\|Thriller	Universal Pictures\|Original Film\|Media Rights ...	4/1/15	2947	7.3	2015	1.747999e+08	1.385749e+09
5	281957	tt1663202	9.110700	135000000	532950503	The Revenant	Leonardo DiCaprio\|Tom Hardy\|Will Poulter\|Domhn...	http://www.foxmovies.com/movies/the-revenant	Alejandro GonzÃ¡lez IÃ±Ã¡rritu	(n. One who has returned, as if from the dead.)	...	In the 1820s, a frontiersman, Hugh Glass, sets...	156	Western\|Drama\|Adventure\|Thriller	Regency Enterprises\|Appian Way\|CatchPlay\|Anony...	12/25/15	3929	7.2	2015	1.241999e+08	4.903142e+08
6	87101	tt1340138	8.654359	155000000	440603537	Terminator Genisys	Arnold Schwarzenegger\|Jason Clarke\|Emilia Clar...	http://www.terminatormovie.com/	Alan Taylor	Reset the future	...	The year is 2029. John Connor, leader of the r...	125	Science Fiction\|Action\|Thriller\|Adventure	Paramount Pictures\|Skydance Productions	6/23/15	2598	5.8	2015	1.425999e+08	4.053551e+08
7	286217	tt3659388	7.667400	108000000	595380321	The Martian	Matt Damon\|Jessica Chastain\|Kristen Wiig\|Jeff ...	http://www.foxmovies.com/movies/the-martian	Ridley Scott	Bring Him Home	...	During a manned mission to Mars, Astronaut Mar...	141	Drama\|Adventure\|Science Fiction	Twentieth Century Fox Film Corporation\|Scott F...	9/30/15	4572	7.6	2015	9.935996e+07	5.477497e+08
8	211672	tt2293640	7.404165	74000000	1156730962	Minions	Sandra Bullock\|Jon Hamm\|Michael Keaton\|Allison...	http://www.minionsmovie.com/	Kyle Balda\|Pierre Coffin	Before Gru, they had a history of bad bosses	...	Minions Stuart, Kevin and Bob are recruited by...	91	Family\|Animation\|Adventure\|Comedy	Universal Pictures\|Illumination Entertainment	6/17/15	2893	6.5	2015	6.807997e+07	1.064192e+09
9	150540	tt2096673	6.326804	175000000	853708609	Inside Out	Amy Poehler\|Phyllis Smith\|Richard Kind\|Bill Ha...	http://movies.disney.com/inside-out	Pete Docter	Meet the little voices inside your head.	...	Growing up can be a bumpy road, and it's no ex...	94	Comedy\|Animation\|Family	Walt Disney Pictures\|Pixar Animation Studios\|W...	6/9/15	3935	8.0	2015	1.609999e+08	7.854116e+08

	id	popularity	budget	revenue	director	runtime	genres	vote_count	vote_average	release_year
0	135397	33.000000	150000000	1.513529e+09	Colin Trevorrow	124	[Action, Adventure, Science Fiction, Thriller]	5562	6.500000	2015
1	76341	28.421875	150000000	3.784364e+08	George Miller	120	[Action, Adventure, Science Fiction, Thriller]	6185	7.101562	2015
2	262500	13.109375	110000000	2.952382e+08	Robert Schwentke	119	[Adventure, Science Fiction, Thriller]	2480	6.300781	2015
3	140607	11.171875	200000000	2.068178e+09	J.J. Abrams	136	[Action, Adventure, Science Fiction, Fantasy]	5292	7.500000	2015
4	168259	9.335938	190000000	1.506249e+09	James Wan	137	[Action, Crime, Thriller]	2947	7.300781	2015
...	...	...	...	...	...	...	...	...	...	...
95	258509	1.841797	30766902	2.337556e+08	Walt Becker	92	[Adventure, Animation, Comedy, Family]	278	5.699219	2015
96	298382	1.823242	11930000	1.834000e+07	Jocelyn Moorhouse	118	[Drama]	197	6.898438	2015
97	272693	1.758789	8500000	4.352863e+07	Ari Sandel	100	[Romance, Comedy]	753	6.800781	2015
98	283445	1.742188	10000000	5.288202e+07	Ciaran Foy	97	[Horror]	331	5.500000	2015
99	256961	1.735352	30000000	1.075972e+08	Andy Fickman	94	[Action, Adventure, Comedy, Family]	422	5.300781	2015

Project: Investigate a Dataset - TMDb Movie Data¶

Table of Contents¶

Introduction¶

Dataset Description¶

Questions for Analysis¶

Data Wrangling¶

Data Cleaning¶

Exploratory Data Analysis¶

Question 1: Does the runtime of a movie correlate with its popularity or revenue?¶

Answer:¶

In Summary:¶

Runtime Vs. Popularity:¶

Runtime vs. Revenue:¶

Further Investigation¶

Question 2: Which directors have the most high-revenue movies?¶

Answer:¶

Question 3: Which movies are most popular from year to year?¶

Answer:¶

Question 4: What kinds of properties are associated with movies that have high revenues?¶

Answer:¶

Popularity and Revenue:¶

Budget and Revenue:¶

Vote Count and Revenue:¶

Runtime and Revenue:¶

Vote Average and Revenue:¶

Conclusions¶

1- Does the runtime of a movie correlate with its popularity or revenue?¶

2- Which directors have the most high-revenue movies?¶

3- Which movies are most popular from year to year?¶

4- What kinds of properties are associated with movies that have high revenues?¶

Faced limitations:¶

Bias Toward More Recent Movies:¶

Limited Genre Information:¶

Further Investigation¶