import numpy as np
import pandas as pd
pd.set_option("display.precision", 2)

# We will explore and answer some questions about the dataset, using the main methods in pandas.

# Film dataset:

df = pd.read_csv('Film.csv',index_col=0)
df

# First, we will take a look at data dimensionality:

print(df.shape)

(100, 9)

# From the output, we can see that the table contains 100 rows and 9 columns.

# We will show basic statistical characteristics:

df.describe()

# Showing statistics on non-numerical features:

df.describe(include=['object'])

df.groupby('Rating')['Time'].median()

Rating
1.0     86.0
1.5     81.5
2.0     89.5
2.5     93.0
3.0    101.0
3.5     98.5
4.0     45.0
Name: Time, dtype: float64

# The above code gives us the median duration of the films in this dataset
# for each Rating value, and shows how duration is related to Rating. For example,
# we can see that the maximum rating (4.0) corresponds to a median duration of 45 minutes.

# Since the Rating feature in this dataset is related to the quality of the film, we will convert the Good feature
# column type into a boolean with the astype method.
# This makes it clearer whether a film is categorized as Good (True) or not (False).

df['Good'] = df['Good'].astype('bool')

print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 1 to 100
Data columns (total 9 columns):
Title          100 non-null object
Year           100 non-null int64
Time           100 non-null int64
Cast           100 non-null int64
Rating         100 non-null float64
Description    100 non-null int64
Origin         100 non-null int64
Time_code      100 non-null object
Good           100 non-null bool
dtypes: bool(1), float64(1), int64(5), object(2)
memory usage: 6.3+ KB
None

# We can see the Good feature column type converted into a boolean value:

df.head()

pd.crosstab(df['Good'], df['Rating'],margins=True)

#Visualizing the crosstab:

# some imports to set up plotting
import matplotlib.pyplot as plt
# pip install seaborn
import seaborn as sns
# Graphics in retina format are more sharp and legible
%config InlineBackend.figure_format = 'retina'

sns.countplot(x='Good', hue='Rating', data=df);

#Answering some questions:

#How many short and long films are represented in this dataset?

df['Time_code'].value_counts()

long     58
short    42
Name: Time_code, dtype: int64

# We have 58 long films and 42 short films.

# What is the average rating of long films?

# Pick the Rating values of the 'long' films with one .loc lookup
# (row mask + column label) and average them.
df.loc[df['Time_code'] == 'long', 'Rating'].mean()

2.5086206896551726

# The average rating of long films is 2.5 approximately.

# What is the percentage of films made in 1971?

# The mean of a boolean mask is the fraction of rows where it is True,
# i.e. the share of films whose Year is 1971 (a value between 0 and 1).
float((df['Year'] == 1971).mean())

0.04

# 4 percent of the films in this dataset were made in 1971

# What are the mean and standard deviation of the ratings for short and long films?

# Mean and standard deviation of Rating within each Time_code group.
# The aggregations are given by name ('mean', 'std') rather than as
# np.mean / np.std: recent pandas versions emit a FutureWarning when NumPy
# callables are passed to agg(), and the string names keep the current
# pandas behaviour (group mean and sample standard deviation).
column_to_show = ['Rating']
ratings = df.groupby(['Time_code'])[column_to_show].agg(['mean', 'std'])

ratings

	Title	Year	Time	Cast	Rating	Description	Origin	Time_code	Good
1	A_Ticklish_Affair	1963	89	5	2.0	7	0	short	0
2	Action_in_the_North_Atlantic	1943	127	7	3.0	9	0	long	1
3	And_the_Ship_Sails_On	1984	138	7	3.0	15	3	long	1
4	Autumn_Sonata	1978	97	5	3.0	11	5	long	1
5	Bachelor_Apartment	1931	77	6	2.5	7	0	short	0
6	Benson_Murder_Case	1930	69	8	2.5	10	0	short	0
7	Black_Hand	1950	93	5	3.0	8	0	long	1
8	Blaze	1989	119	8	2.5	15	0	long	0
9	Blondie_Has_Servant_Trouble	1940	70	9	2.5	8	0	short	0
10	Blondie_in_the_Dough	1947	69	9	2.0	8	0	short	0
11	Brewster_McCloud	1970	101	9	3.0	11	0	long	1
12	Calling_Philo_Vance	1940	62	6	2.0	10	0	short	0
13	Car_Wash	1976	97	10	2.5	12	0	long	0
14	City_Lights	1985	85	10	1.0	13	0	short	0
15	Come_Out_Fighting	1945	62	9	1.5	9	0	short	0
16	Conflict	1945	86	6	2.5	7	0	short	0
17	Conquest	1937	112	10	3.0	10	0	long	1
18	Dakota	1988	97	6	2.0	11	0	long	0
19	Deadhead_Miles	1972	93	12	2.5	11	0	long	0
20	Divided_Heart	1954	89	7	3.0	8	1	short	1
21	Evergreen	1934	90	5	3.0	9	1	long	1
22	Falcon_Strikes_Back	1943	66	9	2.5	9	0	short	0
23	Find_the_Lady	1976	79	6	1.5	13	1	short	0
24	Five_Golden_Hours	1961	90	7	2.0	9	1	long	0
25	Flash_and_the_Firecat	1975	84	6	1.5	7	0	short	0
26	Flight	1929	116	6	2.5	7	0	long	0
27	Four_Jills_in_a_Jeep	1944	89	12	2.5	12	0	short	0
28	Galileo	1973	145	11	3.0	13	1	long	1
29	Hambone_and_Hillie	1984	89	8	2.5	8	0	short	0
30	Hitler--Dead_or_Alive	1943	70	7	2.0	6	0	short	0
...	...	...	...	...	...	...	...	...	...
71	The_Judge_and_the_Assassin	1975	130	5	3.5	9	2	long	1
72	The_Last_Valley	1970	128	6	2.0	9	1	long	0
73	The_Marriage_of_a_Young_Stockbroker	1971	95	6	2.5	14	0	long	0
74	The_Miracle_Worker	1962	107	7	3.5	14	0	long	1
75	The_Mutineers	1949	60	4	1.5	5	0	short	0
76	The_Raven	1963	86	6	3.0	9	0	short	1
77	The_Ravine	1969	97	6	2.0	7	3	long	0
78	The_Revolt_of_Job	1983	97	6	3.5	9	6	long	1
79	The_Romantic_Age	1949	86	6	2.0	7	1	short	0
80	The_Siege_at_Red_River	1954	81	5	2.5	5	0	short	0
81	The_Stone_Boy	1984	93	8	3.5	12	0	long	1
82	The_Strip	1951	85	4	2.0	9	0	short	0
83	The_Surrogate	1984	95	7	2.5	13	4	long	0
84	The_Twinkle_in_God's_Eye	1955	73	5	2.0	7	0	short	0
85	The_Ultimate_Warrior	1975	94	6	2.5	10	0	long	0
86	The_Unholy_Three	1930	72	6	2.5	9	0	short	0
87	The_Well	1951	85	7	3.0	8	0	short	1
88	Tom_Dick_and_Harry	1941	86	7	3.5	11	0	short	1
89	Triumph_of_the_Spirit	1989	121	7	3.0	12	0	long	1
90	Uncle_Moses	1932	87	7	2.5	16	0	short	0
91	Unsane	1982	100	6	2.0	12	3	long	0
92	Valley_of_Gwangi	1969	95	5	2.5	11	0	long	0
93	Valley_of_the_Dragons	1961	79	5	1.5	8	0	short	0
94	Vicki	1953	85	7	2.5	9	0	short	0
95	Volere_Volare	1991	92	7	2.5	17	3	long	0
96	Warning_Shot	1967	100	13	3.5	12	0	long	1
97	Whispering_Smith_vs._Scot._Yard	1951	77	5	2.0	9	1	short	0
98	Windows	1980	96	4	1.0	9	0	long	0
99	Windwalker	1980	108	5	2.5	11	0	long	0
100	You_Only_Live_Twice	1967	116	9	2.5	14	1	long	0

	Year	Time	Cast	Rating	Description	Origin	Good
count	100.00	100.00	100.00	100.00	100.00	100.00	100.00
mean	1963.64	92.87	6.75	2.33	10.02	0.48	0.31
std	18.13	18.26	1.97	0.70	2.87	1.10	0.46
min	1924.00	45.00	3.00	1.00	5.00	0.00	0.00
25%	1949.00	81.00	5.00	2.00	8.00	0.00	0.00
50%	1966.00	93.00	6.00	2.50	9.50	0.00	0.00
75%	1978.50	101.25	8.00	3.00	12.00	0.00	1.00
max	1995.00	145.00	13.00	4.00	21.00	6.00	1.00

Search This Blog

curious_about_data

Exploring a Film Dataset.

Comments

Post a Comment