Reminder on Pandas (from Kaggle)
A list of definitions and memory aids for data and functions
- DataFrame
- Series
- Reading
- Index with iloc (numerical position)
- Index with loc (label-based position)
- Summary functions
- Maps
- GroupBy
- Sorting
- Data Types
- Missing Data
- Renaming
- Combining
import pandas as pd
# Limit how many rows pandas prints when an object is displayed.
pd.set_option("display.max_rows", 6)

# DataFrame: a dict maps each column name to its values; `index` labels the rows.
pd.DataFrame(
    data={
        "Column_1": ["Value_1.1", "Value_2.1"],
        "Column_2": ["Value_1.2", "Value_2.2"],
    },
    index=["Row_1", "Row_2"],
)
pd.DataFrame(
    data={"Apples": [35, 41], "Bananas": [21, 34]},
    index=["2017 Sales", "2018 Sales"],
)

# Series: a single list of values, with optional row labels and a name.
pd.Series(
    data=["Value_1", "Value_2", "Value_3"],
    index=["Row1", "Row2", "Row3"],
    name="nameSerie",
)
pd.Series(
    data=["4 cups", "1 cup", "2 large", "1 can"],
    index=["Flour", "Milk", "Eggs", "Spam"],
    name="Dinner",
)
# --- Reading ---
# Load the wine reviews; index_col=0 uses the first CSV column as the row index.
wines = pd.read_csv("./src/winemag-data-130k-v2.csv", index_col=0)
# (number of rows, number of columns)
wines.shape
# First five rows.
wines.head()
# --- Index with iloc (numerical position; slice end is excluded) ---
# First row.
wines.iloc[0]
# Every row of the first column.
wines.iloc[:, 0]
# First column, rows at positions 0, 1 and 2.
wines.iloc[:3, 0]
# First column, rows at positions 1 and 2.
wines.iloc[1:3, 0]
# Same selection as [:3, 0], with the positions given as a list.
wines.iloc[[0, 1, 2], 0]
# Last five rows.
wines.iloc[-5:]
# --- Index with loc (labels; slice end is included) ---
# Every row of the "points" column.
wines.loc[:, "points"]
# Rows labelled 0 through 9 (both ends included), two columns.
wines.loc[0:9,["country", "variety"]]
# Boolean masks: combine conditions with & (and) / | (or), each wrapped in parentheses.
wines.loc[(wines.country == 'Italy') & (wines.points >= 90)]
wines.loc[(wines.country == 'Italy') | (wines.points >= 90)]
# Rows whose country is one of the listed values.
wines.loc[wines.country.isin(['Italy', 'France'])]
# Rows with / without a price.
wines.loc[wines.price.notnull()]
wines.loc[wines.price.isnull()]
# --- Summary functions ---
# describe() summarises a column; the statistics reported depend on the column's dtype.
wines.points.describe()
wines.taster_name.describe()
# Single statistics of the points column.
wines.points.mean()
wines.points.median()
# Distinct values of the column.
wines.taster_name.unique()
# How many times each distinct value occurs.
wines.taster_name.value_counts()
Maps
mySeries.map(lambda p: function(p))
myDataFrame.apply(nameFunctionOnRow, axis="columns")
myDataFrame.apply(nameFunctionOnColumns, axis="index") (axis="index" is the default: the function is called once per column)
Note: map() and apply() return new, transformed Series and DataFrames, respectively. They don’t modify the original data they’re called on.
# Mean review score, used below to centre the scores around zero.
review_points_mean = wines.points.mean()
# Element by element with map(): subtract the mean from every score.
wines.points.map(lambda p: p - review_points_mean)
# Second, standalone example: the same mean is computed again...
review_points_mean = wines.points.mean()
# ...and subtracted with plain Series arithmetic, which gives the same values as the map() above.
wines.points - review_points_mean
def remean_points(row):
    """Return a copy of ``row`` with its points centred on the mean.

    Meant to be passed to ``DataFrame.apply(..., axis='columns')``, which
    calls it once per row.

    Args:
        row: A row supporting ``row["points"]`` and ``row.copy()``
            (a pandas Series when called through ``apply``).

    Returns:
        A new row equal to ``row`` except that ``points`` has the
        module-level ``review_points_mean`` subtracted from it.
    """
    # pandas does not support user functions that mutate the object handed
    # to them by apply(), so the adjustment is made on a copy.
    centered = row.copy()
    centered["points"] = row["points"] - review_points_mean
    return centered
# Centre every row's score: apply() calls remean_points once per row and
# returns a new DataFrame (the result is only displayed, not stored).
wines.apply(remean_points, axis='columns')

# Index label of the best bargain, i.e. the highest points-to-price ratio.
bidx = (wines.points / wines.price).idxmax()
# idxmax() returns an index *label*, so the title is looked up with the
# label-based .loc; positional .iloc is only right when labels equal positions.
bargain_wine = wines.loc[bidx, "title"]
bargain_wine
def nb_stars(row):
    """Convert one wine review into a star rating from 1 to 3.

    Args:
        row: Any object exposing ``country`` and ``points`` attributes,
            e.g. a row passed by ``DataFrame.apply(..., axis="columns")``.

    Returns:
        3 if the wine is from Canada or scored at least 95 points,
        2 if it scored at least 85 points, otherwise 1.
    """
    # The country is tested first, so Canadian wines get 3 stars
    # whatever their score.
    if row.country == "Canada" or row.points >= 95:
        return 3
    if row.points >= 85:
        return 2
    return 1
# Star rating of every review, computed row by row with nb_stars.
star_ratings = wines.apply(nb_stars, axis="columns")
print(star_ratings)

# Number of descriptions that mention each word.
nb_tropical = sum("tropical" in text for text in wines.description)
nb_fruity = sum("fruity" in text for text in wines.description)
descriptor_counts = pd.Series(
    data=[nb_tropical, nb_fruity],
    index=['tropical', 'fruity'],
)
print(descriptor_counts)
# --- GroupBy ---
# Number of reviews per score: value_counts() orders by frequency...
wines.points.value_counts()
# ...while the groupby form orders by the group key (the score).
wines.groupby('points').points.count()

# Cheapest price recorded for each score.
wines.groupby('points').price.min()
# Average price for each score, rounded to one decimal.
wines.groupby('points').price.mean().round(1)

# apply() receives each group as a DataFrame: title of the first wine listed per winery.
wines.groupby('winery').apply(lambda df: df.title.iloc[0])
# Grouping by two keys: the top-scoring row of every (country, province) pair.
wines.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.idxmax()])

# Several aggregations at once, one output column each ('len', 'min', 'max').
# "min"/"max" are given as strings because recent pandas warns when the Python
# builtins are passed to agg(); the resulting column names are unchanged.
wines.groupby(['country']).price.agg([len, "min", "max"])
# Count the descriptions per (country, province) pair; the single aggregation yields one column named 'len'.
countries_reviewed = wines.groupby(['country', 'province']).description.agg([len])
countries_reviewed
# Grouping by two keys produces a multi-level index; inspect its type.
mi = countries_reviewed.index
type(mi)
# reset_index() turns the index levels back into ordinary columns (shown here, not stored).
countries_reviewed.reset_index()
# Keep the flattened version for the sorting examples below.
countries_reviewed = countries_reviewed.reset_index()
# --- Sorting ---
# Ascending by the count column (the default order).
countries_reviewed.sort_values(by='len')
# Descending by the count column.
countries_reviewed.sort_values(by='len', ascending=False)
# Sort by index labels instead of by values.
countries_reviewed.sort_index()
# Sort by several columns: first by country, then by count within each country.
countries_reviewed.sort_values(by=['country', 'len'])
# --- Data Types ---
# Display the full frame.
wines
# dtype of the row index.
wines.index.dtype
# dtype of a single column.
wines.price.dtype
Missing Data
Important: Entries missing values are given the value NaN, short for "Not a Number". For technical reasons these NaN values are always of the float64 dtype.
---
myDataFrameOrSeries[pd.isnull(myDataFrameOrSeries.nameColumn)]
myDataFrameOrSeries[pd.notnull(myDataFrameOrSeries.nameColumn)]
myDataFrameOrSeries.nameColumn.fillna(value)
myDataFrameOrSeries.nameColumn.replace(oldValue, newValue)
# Rows whose country is missing.
wines[pd.isnull(wines.country)]
# Work on the country column through its own name.
countryWines = wines.country
countryWines
# Indexing with a callable: pandas calls pd.isnull on the Series and keeps the entries where it returns True.
countryWines[pd.isnull]
# The boolean mask itself (True where the value is missing).
countryWines.isnull()
# fillna() returns a new Series, so the result is assigned back to the name.
countryWines = countryWines.fillna("Unknown")
# No missing values remain after the fill, so this selection is now empty.
countryWines[pd.isnull]
# Value stored under index label 913 (presumably one of the rows whose country was missing - verify).
countryWines[913]
# replace() swaps every occurrence of one value for another and also returns a new Series.
countryWines = countryWines.replace("Unknown", "Invalid")
countryWines[913]
# --- Renaming ---
# Second name bound to the same DataFrame object (no copy is made).
winesTest = wines
winesTest.head()
# rename() returns a new frame with column "points" renamed to "score"; .head() then keeps
# only its first five rows, so winesTest is a five-row frame from here on.
winesTest = winesTest.rename(columns={"points": "score"}).head()
winesTest.head()
# rename_axis() names the row index; the result is displayed, not stored.
winesTest.rename_axis("wineIndex", axis="rows")
# Chained call: name the row axis and the column axis in one expression.
winesTest.rename_axis("wineIndex", axis="rows").rename_axis("fields", axis="columns")