import pandas as pd
# show the pandas version (bare expression -> displayed as notebook cell output)
pd.__version__
import matplotlib.pyplot as plt
# Jupyter/IPython magic: render matplotlib figures inline (not valid plain Python)
%matplotlib inline
# load the TED talks dataset; assumes ted.csv is in the working directory
ted = pd.read_csv('ted.csv')
# each row represents a single talk
ted.head()
# column names, non-null counts, and dtypes in one summary
ted.info()
# rows, columns
ted.shape
# object columns are usually strings, but can also be arbitrary Python objects (lists, dictionaries)
ted.dtypes
# count the number of missing values in each column
ted.isna().sum()
# sort by the number of first-level comments, though this is biased in favor of older talks
ted.sort_values('comments').tail()
# correct for this bias by calculating the number of comments per view
ted['comments_per_view'] = ted.comments / ted.views
# interpretation: for every view of the same-sex marriage talk, there are 0.002 comments
ted.sort_values('comments_per_view').tail()
# make this more interpretable by inverting the calculation
ted['views_per_comment'] = ted.views / ted.comments
# interpretation: 1 out of every 450 people leave a comment
ted.sort_values('views_per_comment').head()
Lessons:
# line plot is not appropriate here (use it to measure something over time)
ted.comments.plot()
# histogram shows the frequency distribution of a single numeric variable
ted.comments.plot(kind='hist')
# make the plot more informative: exclude the long right tail (talks with 1000+ comments)
ted[ted.comments < 1000].comments.plot(kind='hist')
# check how many observations we removed from the plot
ted[ted.comments >= 1000].shape
# can also write this using the query method
ted.query('comments < 1000').comments.plot(kind='hist')
# can also write this using the loc accessor (rows filtered by boolean mask, single column selected)
ted.loc[ted.comments < 1000, 'comments'].plot(kind='hist')
# increase the number of bins to see more detail
ted.loc[ted.comments < 1000, 'comments'].plot(kind='hist', bins=20)
# boxplot can also show distributions, but it's far less useful for concentrated distributions because of outliers
ted.loc[ted.comments < 1000, 'comments'].plot(kind='box')
Lessons:
Bonus exercise: calculate the average delay between filming and publishing
# event column does not always include the year
ted.event.sample(10)
# dataset documentation for film_date says "Unix timestamp of the filming"
ted.film_date.head()
# results don't look right: without a unit, to_datetime treats the integers as nanoseconds (everything lands in 1970)
pd.to_datetime(ted.film_date).head()
# now the results look right: unit='s' interprets the values as seconds since the Unix epoch
pd.to_datetime(ted.film_date, unit='s').head()
ted['film_datetime'] = pd.to_datetime(ted.film_date, unit='s')
# verify that event name matches film_datetime for a random sample
ted[['event', 'film_datetime']].sample(5)
# new column uses the datetime data type (this was an automatic conversion)
ted.dtypes
# datetime columns have convenient attributes under the dt namespace
ted.film_datetime.dt.year.head()
# similar to string methods under the str namespace
ted.event.str.lower().head()
# count the number of talks each year using value_counts()
ted.film_datetime.dt.year.value_counts()
# points are plotted and connected in the order you give them to pandas
# (value_counts sorts by count, not by year, so the line zigzags)
ted.film_datetime.dt.year.value_counts().plot()
# need to sort the index before plotting
ted.film_datetime.dt.year.value_counts().sort_index().plot()
# we only have partial data for 2017 (latest filming date in the dataset)
ted.film_datetime.max()
Lessons:
# count the number of talks (great if you value variety, but they may not be great talks)
ted.event.value_counts().head()
# use views as a proxy for "quality of talk"
ted.groupby('event').views.mean().head()
# find the largest values, but we don't know how many talks are being averaged
ted.groupby('event').views.mean().sort_values().tail()
# show the number of talks along with the mean (events with the highest means had only 1 or 2 talks)
# agg() with a list of function names produces one output column per function
ted.groupby('event').views.agg(['count', 'mean']).sort_values('mean').tail()
# calculate the total views per event
ted.groupby('event').views.agg(['count', 'mean', 'sum']).sort_values('sum').tail()
Lessons:
# previously, users could tag talks on the TED website (funny, inspiring, confusing, etc.)
ted.ratings.head()
# two ways to examine the ratings data for the first talk
ted.loc[0, 'ratings']
ted.ratings[0]
# this is a string not a list
type(ted.ratings[0])
# convert this into something useful using Python's ast module (Abstract Syntax Tree)
import ast
# literal_eval() allows you to evaluate a string containing a Python literal or container
# (a safe alternative to eval: it only accepts literals, never arbitrary expressions)
ast.literal_eval('[1, 2, 3]')
# if you have a string representation of something, you can retrieve what it actually represents
type(ast.literal_eval('[1, 2, 3]'))
# unpack the ratings data for the first talk
ast.literal_eval(ted.ratings[0])
# now we have a list (of dictionaries)
type(ast.literal_eval(ted.ratings[0]))
# define a function to convert an element in the ratings Series from string to list
def str_to_list(ratings_str):
    """Parse a string-encoded Python literal (here, a list of rating dicts) into the real object."""
    parsed = ast.literal_eval(ratings_str)
    return parsed
# test the function on a single element before applying it to the whole Series
str_to_list(ted.ratings[0])
# Series apply method applies a function to every element in a Series and returns a Series
ted.ratings.apply(str_to_list).head()
# lambda is a shorter alternative
ted.ratings.apply(lambda x: ast.literal_eval(x)).head()
# an even shorter alternative is to apply the function directly (without lambda)
ted.ratings.apply(ast.literal_eval).head()
ted['ratings_list'] = ted.ratings.apply(lambda x: ast.literal_eval(x))
# check that the new Series looks as expected
ted.ratings_list[0]
# each element in the Series is a list
type(ted.ratings_list[0])
# data type of the new Series is object
ted.ratings_list.dtype
# object is not just for strings
ted.dtypes
Lessons:
Bonus exercises:
# expected result (for each talk) is the sum of the 'count' values across its rating dicts
ted.ratings_list[0]
# start by building a simple function, then refine it step by step below
def get_num_ratings(list_of_dicts):
    """First iteration of the helper: just return the first rating dictionary."""
    first_rating = list_of_dicts[0]
    return first_rating
# pass it a list, and it returns the first element in the list, which is a dictionary
get_num_ratings(ted.ratings_list[0])
# modify the function to return the vote count from that first dictionary
def get_num_ratings(list_of_dicts):
    """Second iteration: return the vote count from the first rating dictionary only."""
    first_rating = list_of_dicts[0]
    return first_rating['count']
# pass it a list, and it returns a value from the first dictionary in the list
get_num_ratings(ted.ratings_list[0])
# modify the function to get the sum of count across all dictionaries
def get_num_ratings(list_of_dicts):
    """Final iteration: sum the 'count' values across all rating dicts for one talk."""
    # sum() over a generator gives 0 for an empty list, matching the accumulator loop
    return sum(rating['count'] for rating in list_of_dicts)
# looks about right
get_num_ratings(ted.ratings_list[0])
# check with another record
ted.ratings_list[1]
# looks about right
get_num_ratings(ted.ratings_list[1])
# apply it to every element in the Series
ted.ratings_list.apply(get_num_ratings).head()
# another alternative is to use a generator expression instead of a named function
sum((d['count'] for d in ted.ratings_list[0]))
# use lambda to apply this method
ted.ratings_list.apply(lambda x: sum((d['count'] for d in x))).head()
# another alternative is to use pd.DataFrame() to convert one list of dicts into a DataFrame
pd.DataFrame(ted.ratings_list[0])['count'].sum()
# use lambda to apply this method (slowest option: builds a DataFrame per row)
ted.ratings_list.apply(lambda x: pd.DataFrame(x)['count'].sum()).head()
ted['num_ratings'] = ted.ratings_list.apply(get_num_ratings)
# do one more check: summary statistics should show plausible, non-negative counts
ted.num_ratings.describe()
Lessons:
Bonus exercises:
# "Funny" is not always the first dictionary in the list
ted.ratings_list.head()
# check ratings (not ratings_list) to see if "Funny" is always a rating type
ted.ratings.str.contains('Funny').value_counts()
# write a custom function
def get_funny_ratings(list_of_dicts):
    """Return the vote count for the 'Funny' rating, or None when the talk has no such rating."""
    # next() with a None default mirrors the loop's implicit None when no match is found
    return next(
        (rating['count'] for rating in list_of_dicts if rating['name'] == 'Funny'),
        None,
    )
# examine a record in which "Funny" is not the first dictionary
ted.ratings_list[3]
# check that the function works
get_funny_ratings(ted.ratings_list[3])
# apply it to every element in the Series
ted['funny_ratings'] = ted.ratings_list.apply(get_funny_ratings)
ted.funny_ratings.head()
# check for missing values
ted.funny_ratings.isna().sum()
ted['funny_rate'] = ted.funny_ratings / ted.num_ratings
# "gut check" that this calculation makes sense by examining the occupations of the funniest talks
ted.sort_values('funny_rate').speaker_occupation.tail(20)
# examine the occupations of the least funny talks
ted.sort_values('funny_rate').speaker_occupation.head(20)
# calculate the mean funny rate for each occupation
ted.groupby('speaker_occupation').funny_rate.mean().sort_values().tail()
# however, most of the occupations have a sample size of 1
ted.speaker_occupation.describe()
# count how many times each occupation appears
ted.speaker_occupation.value_counts()
# value_counts() outputs a pandas Series, thus we can use pandas to manipulate the output
occupation_counts = ted.speaker_occupation.value_counts()
type(occupation_counts)
# show occupations which appear at least 5 times
occupation_counts[occupation_counts >= 5]
# save the index of this Series
top_occupations = occupation_counts[occupation_counts >= 5].index
top_occupations
# filter DataFrame to include only those occupations
ted_top_occupations = ted[ted.speaker_occupation.isin(top_occupations)]
ted_top_occupations.shape
# redo the previous groupby
ted_top_occupations.groupby('speaker_occupation').funny_rate.mean().sort_values()
Lessons: