Exploring a Film Dataset.
In [1]:
import numpy as np
import pandas as pd
# Show floating-point values with two decimal places in rendered tables.
pd.options.display.precision = 2
In [2]:
# We will explore and answer some questions about the dataset, using the main methods in pandas.
In [3]:
# Film dataset:
In [4]:
# Load the film table; the first CSV column supplies the row labels.
df = pd.read_csv("Film.csv", index_col=0)
df
Out[4]:
In [5]:
# First, we will take a look at data dimensionality:
In [6]:
# (number of rows, number of columns)
print((len(df.index), len(df.columns)))
In [7]:
# From the output, we can see that the table contains 100 rows and 9 columns.
In [8]:
# We will show basic statistical characteristics:
In [9]:
# Summary statistics (count, mean, std, min, quartiles, max); by default pandas
# restricts describe() to the numeric columns of a mixed-dtype frame.
df.describe()
Out[9]:
In [10]:
# Showing statistics on non-numerical features:
In [11]:
# Summarise only the object-dtype (text) columns.
df.describe(include=[object])
Out[11]:
In [12]:
# Median film duration within each Rating value.
df['Time'].groupby(df['Rating']).median()
Out[12]:
In [13]:
# The above code gives us the median duration of the films in this dataset
# for each Rating value, and shows how duration relates to Rating. For example,
# we can see that films with the maximum rating have a median duration of 45 minutes.
In [14]:
# Since the Rating feature in this dataset is related to the quality of the film, we will convert the Good feature
# column type into a boolean with the astype method.
# This way it will be clearer whether a film is categorized as Good (True) or not (False).
In [15]:
# Re-type the Good flag as a boolean column.
# NOTE(review): astype(bool) maps every non-zero number and every non-empty
# string to True -- confirm that Good is stored as 0/1 in Film.csv.
df['Good'] = df['Good'].astype(bool)
In [16]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df.info()
In [17]:
# We can see the Good feature column type converted into a boolean value:
In [18]:
# Preview the first five rows to check the converted column.
df.head(n=5)
Out[18]:
In [36]:
# Contingency table of Good against Rating, with row/column totals ("All").
pd.crosstab(index=df['Good'], columns=df['Rating'], margins=True)
Out[36]:
In [20]:
#Visualizing the crosstab:
In [37]:
# some imports to set up plotting
import matplotlib.pyplot as plt
# pip install seaborn
import seaborn as sns
# Render inline figures in retina format so they are sharper and more legible.
%config InlineBackend.figure_format = 'retina'
In [38]:
# One bar per Rating value, grouped by the Good flag; the trailing semicolon
# suppresses the Axes repr in the cell output.
sns.countplot(data=df, x='Good', hue='Rating');
In [22]:
#Answering some questions:
In [23]:
#How many short and long films are represented in this dataset?
In [24]:
# Count how many films fall into each Time_code category (most frequent first).
df['Time_code'].value_counts()
Out[24]:
In [25]:
# We have 58 long films and 42 short films.
In [26]:
# What is the average rating of long films?
In [27]:
# Select the Rating values of the long films in a single .loc lookup, then average them.
df.loc[df['Time_code'] == 'long', 'Rating'].mean()
Out[27]:
In [28]:
# The average rating of long films is approximately 2.5.
In [29]:
# What is the percentage of films made in 1971?
In [30]:
# Share of films released in 1971, expressed as a percentage (0-100) to match
# the question being asked. Multiplying before dividing keeps the arithmetic
# exact for whole-number percentages; float() yields a plain Python float so
# the cell output renders as a bare number.
float(100 * (df['Year'] == 1971).sum() / len(df))
Out[30]:
In [31]:
# 4 percent of the films in this dataset were made in 1971
In [32]:
# What are the mean and standard deviation of the ratings for short and long films?
In [33]:
# Mean and sample standard deviation of Rating for each Time_code group.
# The aggregations are named by string: passing np.mean / np.std to a groupby
# agg is deprecated in pandas 2.x (it raises a FutureWarning), and the strings
# select the same pandas implementations that those callables are mapped to.
column_to_show = ['Rating']
ratings = df.groupby('Time_code')[column_to_show].agg(['mean', 'std'])
In [34]:
# Show the aggregated table built in the previous cell.
ratings
Out[34]:
Comments
Post a Comment