import pandas as pd
pd.set_option("display.max_rows", 6)

DataFrame

pd.DataFrame({Column1: [Value1.1, Value2.1],
              Column2: [Value1.2, Value2.2]},
            index=["Row1", "Row2"])

DataFrame = a table, like an Excel sheet

pd.DataFrame({"Column_1": ["Value_1.1", "Value_2.1"],
              "Column_2": ["Value_1.2", "Value_2.2"]},
             index=["Row_1", "Row_2"])

pd.DataFrame({"Apples": [35, 41], 
              "Bananas": [21, 34]},
             index=["2017 Sales", "2018 Sales"])

Series

pd.Series([Value1, Value2, Value3],
          index=["Row1", "Row2", "Row3"],
          name="nameSerie")

Series = list

pd.Series(["Value_1", "Value_2", "Value_3"],
          index=["Row1", "Row2", "Row3"],
          name="nameSerie")

Row1    Value_1
Row2    Value_2
Row3    Value_3
Name: nameSerie, dtype: object

pd.Series(["4 cups", "1 cup", "2 large", "1 can"],
          index=["Flour", "Milk", "Eggs", "Spam"],
          name="Dinner")

Flour     4 cups
Milk       1 cup
Eggs     2 large
Spam       1 can
Name: Dinner, dtype: object

Reading

pd.read_csv("path", index_col=0)

Pass the option "index_col=0" if you want to use the index stored in the CSV (its first column) as the DataFrame index.

wines = pd.read_csv("./src/winemag-data-130k-v2.csv", index_col=0)

wines.shape

(129971, 13)

wines.head()

Index with iloc (numerical position)

data.iloc[row, column]

":" means everything

iloc uses the Python stdlib indexing scheme, where the first element of the range is included and the last one excluded.

Warning: with iloc : [0:10] => 0,...,9

wines.iloc[0]

country                                                    Italy
description    Aromas include tropical fruit, broom, brimston...
designation                                         Vulkà Bianco
                                     ...                        
title                          Nicosia 2013 Vulkà Bianco  (Etna)
variety                                              White Blend
winery                                                   Nicosia
Name: 0, Length: 13, dtype: object

wines.iloc[:, 0]

0            Italy
1         Portugal
2               US
            ...   
129968      France
129969      France
129970      France
Name: country, Length: 129971, dtype: object

wines.iloc[:3, 0]

0       Italy
1    Portugal
2          US
Name: country, dtype: object

wines.iloc[1:3, 0]

1    Portugal
2          US
Name: country, dtype: object

wines.iloc[[0, 1, 2], 0]

0       Italy
1    Portugal
2          US
Name: country, dtype: object

wines.iloc[-5:]

Index with loc (label-based position)

data.loc[row, column]

loc indexes inclusively. So 0:10 will select entries 0,...,10.

Warning: with loc : [0:10] => 0,...,10

wines.loc[:, "points"]

0         87
1         87
2         87
          ..
129968    90
129969    90
129970    90
Name: points, Length: 129971, dtype: int64

wines.loc[0:9,["country", "variety"]]

wines.loc[(wines.country == 'Italy') & (wines.points >= 90)]

wines.loc[(wines.country == 'Italy') | (wines.points >= 90)]

wines.loc[wines.country.isin(['Italy', 'France'])]

wines.loc[wines.price.notnull()]

wines.loc[wines.price.isnull()]

Summary functions

wines.points.describe()

count    129971.000000
mean         88.447138
std           3.039730
             ...      
50%          88.000000
75%          91.000000
max         100.000000
Name: points, Length: 8, dtype: float64

wines.taster_name.describe()

count         103727
unique            19
top       Roger Voss
freq           25514
Name: taster_name, dtype: object

wines.points.mean()

88.44713820775404

wines.points.median()

88.0

wines.taster_name.unique()

array(['Kerin O’Keefe', 'Roger Voss', 'Paul Gregutt',
       'Alexander Peartree', 'Michael Schachner', 'Anna Lee C. Iijima',
       'Virginie Boone', 'Matt Kettmann', nan, 'Sean P. Sullivan',
       'Jim Gordon', 'Joe Czerwinski', 'Anne Krebiehl\xa0MW',
       'Lauren Buzzeo', 'Mike DeSimone', 'Jeff Jenssen',
       'Susan Kostrzewa', 'Carrie Dykes', 'Fiona Adams',
       'Christina Pickard'], dtype=object)

wines.taster_name.value_counts()

Roger Voss           25514
Michael Schachner    15134
Kerin O’Keefe        10776
                     ...  
Carrie Dykes           139
Fiona Adams             27
Christina Pickard        6
Name: taster_name, Length: 19, dtype: int64

Maps

mySeries.map(lambda p: function(p))

myDataFrame.apply(nameFunctionOnRow, axis="columns")

myDataFrame.apply(nameFunctionOnColumns, axis="index") (by default)

Note: map() and apply() return new, transformed Series and DataFrames, respectively. They don’t modify the original data they’re called on.

review_points_mean = wines.points.mean()
wines.points.map(lambda p: p - review_points_mean)

0        -1.447138
1        -1.447138
2        -1.447138
            ...   
129968    1.552862
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

review_points_mean = wines.points.mean()
wines.points - review_points_mean

0        -1.447138
1        -1.447138
2        -1.447138
            ...   
129968    1.552862
129969    1.552862
129970    1.552862
Name: points, Length: 129971, dtype: float64

def remean_points(row):
    """Return a copy of ``row`` with its ``points`` entry centered on the mean.

    Meant to be passed to ``DataFrame.apply(..., axis='columns')``, which
    calls it once per row. The mean is read from the module-level
    ``review_points_mean``.

    Args:
        row: A row exposing a ``points`` entry and a ``copy()`` method
            (a pandas Series when called through ``apply``).

    Returns:
        A new row whose ``points`` value is ``points - review_points_mean``;
        the row that was passed in is left untouched.
    """
    # Work on a copy: pandas warns that a function handed to apply() should
    # not mutate the object it receives.
    centered = row.copy()
    centered["points"] = centered["points"] - review_points_mean
    return centered

wines.apply(remean_points, axis='columns')

# Index label of the best bargain: the highest points-to-price ratio.
bidx = (wines.points / wines.price).idxmax()
# idxmax() returns an index *label*, so look the title up with .loc;
# positional .iloc only matches while the index happens to be 0..N-1.
bargain_wine = wines.title.loc[bidx]
bargain_wine

'Bandit NV Merlot (California)'

def nb_stars(row):
    """Map one review row to a star rating between 1 and 3.

    Args:
        row: An object exposing ``country`` and ``points`` attributes.

    Returns:
        3 when the country is "Canada" or the score is at least 95,
        2 when the score is at least 85, and 1 otherwise.
    """
    # The country check comes first so Canadian wines never consult the score.
    if row.country == "Canada" or row.points >= 95:
        return 3
    return 2 if row.points >= 85 else 1
        
star_ratings = wines.apply(nb_stars, axis="columns")

print(star_ratings)

0         2
1         2
2         2
         ..
129968    2
129969    2
129970    2
Length: 129971, dtype: int64

# Count the descriptions mentioning each word. The boolean Series produced by
# map() is totalled with the vectorized Series.sum() rather than the builtin
# sum(), which would iterate element by element in Python.
nb_tropical = wines.description.map(lambda text: "tropical" in text).sum()

nb_fruity = wines.description.map(lambda text: "fruity" in text).sum()

# Collect both counts in one Series labelled by the descriptor word.
descriptor_counts = pd.Series(
    [nb_tropical, nb_fruity],
    index=['tropical', 'fruity'],
)

print(descriptor_counts)

tropical    3607
fruity      9090
dtype: int64

GroupBy

wines.points.value_counts()

88     17207
87     16933
90     15410
       ...  
98        77
99        33
100       19
Name: points, Length: 21, dtype: int64

wines.groupby('points').points.count()

points
80      397
81      692
82     1836
       ... 
98       77
99       33
100      19
Name: points, Length: 21, dtype: int64

wines.groupby('points').price.min()

points
80      5.0
81      5.0
82      4.0
       ... 
98     50.0
99     44.0
100    80.0
Name: price, Length: 21, dtype: float64

wines.groupby('points').price.mean().round(1)

points
80      16.4
81      17.2
82      18.9
       ...  
98     245.5
99     284.2
100    485.9
Name: price, Length: 21, dtype: float64

wines.groupby('winery').apply(lambda df: df.title.iloc[0])

winery
1+1=3                                     1+1=3 NV Rosé Sparkling (Cava)
10 Knots                            10 Knots 2010 Viognier (Paso Robles)
100 Percent Wine              100 Percent Wine 2015 Moscato (California)
                                             ...                        
Ökonomierat Rebholz    Ökonomierat Rebholz 2007 Von Rotliegenden Spät...
àMaurice               àMaurice 2013 Fred Estate Syrah (Walla Walla V...
Štoka                                    Štoka 2009 Izbrani Teran (Kras)
Length: 16757, dtype: object

wines.groupby(['country', 'province']).apply(lambda df: df.loc[df.points.idxmax()])

# Per-country row count plus the lowest and highest price. "min"/"max" are
# given as string aliases because passing the builtins to agg() is deprecated
# in recent pandas; `len` stays so the result keeps its 'len' column name.
wines.groupby(['country']).price.agg([len, "min", "max"])

countries_reviewed = wines.groupby(['country', 'province']).description.agg([len])
countries_reviewed

mi = countries_reviewed.index
type(mi)

pandas.core.indexes.multi.MultiIndex

countries_reviewed.reset_index()

Sorting

myDataFrame.sort_values(by="nameColumn")  # ascending=True by default

myDataFrame.sort_values(by="nameColumn", ascending=False)

myDataFrame.sort_values(by=["nameColumn1", "nameColumn2"])

myDataFrame.sort_index()

countries_reviewed = countries_reviewed.reset_index()
countries_reviewed.sort_values(by='len')

countries_reviewed.sort_values(by='len', ascending=False)

countries_reviewed.sort_index()

countries_reviewed.sort_values(by=['country', 'len'])

Data Types

myDataFrameOrSeries.nameColumn.dtype

myDataFrameOrSeries.index.dtype

myDataFrameOrSeries.dtypes

myDataFrameOrSeries.nameColumn.astype("nameOfType")

wines

wines.index.dtype

dtype('int64')

wines.price.dtype

dtype('float64')

Missing Data

Important: Entries missing values are given the value NaN, short for "Not a Number". For technical reasons these NaN values are always of the float64 dtype.

---

myDataFrameOrSeries[pd.isnull(myDataFrameOrSeries.nameColumn)]

myDataFrameOrSeries[pd.notnull(myDataFrameOrSeries.nameColumn)]

myDataFrameOrSeries.nameColumn.fillna(value)

myDataFrameOrSeries.nameColumn.replace(oldValue, newValue)

wines[pd.isnull(wines.country)]

countryWines = wines.country

countryWines

0            Italy
1         Portugal
2               US
            ...   
129968      France
129969      France
129970      France
Name: country, Length: 129971, dtype: object

countryWines[pd.isnull]

913       NaN
3131      NaN
4243      NaN
         ... 
129408    NaN
129590    NaN
129900    NaN
Name: country, Length: 63, dtype: object

countryWines.isnull()

0         False
1         False
2         False
          ...  
129968    False
129969    False
129970    False
Name: country, Length: 129971, dtype: bool

countryWines = countryWines.fillna("Unknown")

countryWines[pd.isnull]

Series([], Name: country, dtype: object)

countryWines[913]

'Unknown'

countryWines = countryWines.replace("Unknown", "Invalid")

countryWines[913]

'Invalid'

Renaming

myDataFrame.rename(columns={"oldColumnName": "newColumnName"})

myDataFrame.rename(index={oldValueIndex: newValueIndex})

myDataFrame.rename_axis("nameOfIndex", axis='rows').rename_axis("nameOfFields", axis='columns')

winesTest = wines
winesTest.head()

winesTest = winesTest.rename(columns={"points": "score"}).head()

winesTest.head()

winesTest.rename_axis("wineIndex", axis="rows")

winesTest.rename_axis("wineIndex", axis="rows").rename_axis("fields", axis="columns")

Combining

When two files have the same fields (columns):

pd.concat([myDataFrameOrSeries1, myDataFrameOrSeries2])

When two files have the same index but not the same fields (columns):

myDataFrameOrSeriesLeft.join(myDataFrameOrSeriesRight, lsuffix="nameLeft", rsuffix="nameRight")

	country	description	designation	points	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
0	Italy	Aromas include tropical fruit, broom, brimston...	Vulkà Bianco	87	NaN	Sicily & Sardinia	Etna	NaN	Kerin O’Keefe	@kerinokeefe	Nicosia 2013 Vulkà Bianco (Etna)	White Blend	Nicosia
1	Portugal	This is ripe and fruity, a wine that is smooth...	Avidagos	87	15.0	Douro	NaN	NaN	Roger Voss	@vossroger	Quinta dos Avidagos 2011 Avidagos Red (Douro)	Portuguese Red	Quinta dos Avidagos
2	US	Tart and snappy, the flavors of lime flesh and...	NaN	87	14.0	Oregon	Willamette Valley	Willamette Valley	Paul Gregutt	@paulgwine	Rainstorm 2013 Pinot Gris (Willamette Valley)	Pinot Gris	Rainstorm
3	US	Pineapple rind, lemon pith and orange blossom ...	Reserve Late Harvest	87	13.0	Michigan	Lake Michigan Shore	NaN	Alexander Peartree	NaN	St. Julian 2013 Reserve Late Harvest Riesling ...	Riesling	St. Julian
4	US	Much like the regular bottling from 2012, this...	Vintner's Reserve Wild Child Block	87	65.0	Oregon	Willamette Valley	Willamette Valley	Paul Gregutt	@paulgwine	Sweet Cheeks 2012 Vintner's Reserve Wild Child...	Pinot Noir	Sweet Cheeks

	country	description	designation	points	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
129966	Germany	Notes of honeysuckle and cantaloupe sweeten th...	Brauneberger Juffer-Sonnenuhr Spätlese	90	28.0	Mosel	NaN	NaN	Anna Lee C. Iijima	NaN	Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...	Riesling	Dr. H. Thanisch (Erben Müller-Burggraef)
129967	US	Citation is given as much as a decade of bottl...	NaN	90	75.0	Oregon	Oregon	Oregon Other	Paul Gregutt	@paulgwine	Citation 2004 Pinot Noir (Oregon)	Pinot Noir	Citation
129968	France	Well-drained gravel soil gives this wine its c...	Kritt	90	30.0	Alsace	Alsace	NaN	Roger Voss	@vossroger	Domaine Gresser 2013 Kritt Gewurztraminer (Als...	Gewürztraminer	Domaine Gresser
129969	France	A dry style of Pinot Gris, this is crisp with ...	NaN	90	32.0	Alsace	Alsace	NaN	Roger Voss	@vossroger	Domaine Marcel Deiss 2012 Pinot Gris (Alsace)	Pinot Gris	Domaine Marcel Deiss
129970	France	Big, rich and off-dry, this is powered by inte...	Lieu-dit Harth Cuvée Caroline	90	21.0	Alsace	Alsace	NaN	Roger Voss	@vossroger	Domaine Schoffit 2012 Lieu-dit Harth Cuvée Car...	Gewürztraminer	Domaine Schoffit

	country	description	designation	points	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
120	Italy	Slightly backward, particularly given the vint...	Bricco Rocche Prapó	92	70.0	Piedmont	Barolo	NaN	NaN	NaN	Ceretto 2003 Bricco Rocche Prapó (Barolo)	Nebbiolo	Ceretto
130	Italy	At the first it was quite muted and subdued, b...	Bricco Rocche Brunate	91	70.0	Piedmont	Barolo	NaN	NaN	NaN	Ceretto 2003 Bricco Rocche Brunate (Barolo)	Nebbiolo	Ceretto
133	Italy	Einaudi's wines have been improving lately, an...	NaN	91	68.0	Piedmont	Barolo	NaN	NaN	NaN	Poderi Luigi Einaudi 2003 Barolo	Nebbiolo	Poderi Luigi Einaudi
...	...	...	...	...	...	...	...	...	...	...	...	...	...
129947	Italy	A blend of 65% Cabernet Sauvignon, 30% Merlot ...	Symposio	90	20.0	Sicily & Sardinia	Terre Siciliane	NaN	Kerin O’Keefe	@kerinokeefe	Feudo Principi di Butera 2012 Symposio Red (Te...	Red Blend	Feudo Principi di Butera
129961	Italy	Intense aromas of wild cherry, baking spice, t...	NaN	90	30.0	Sicily & Sardinia	Sicilia	NaN	Kerin O’Keefe	@kerinokeefe	COS 2013 Frappato (Sicilia)	Frappato	COS
129962	Italy	Blackberry, cassis, grilled herb and toasted a...	Sàgana Tenuta San Giacomo	90	40.0	Sicily & Sardinia	Sicilia	NaN	Kerin O’Keefe	@kerinokeefe	Cusumano 2012 Sàgana Tenuta San Giacomo Nero d...	Nero d'Avola	Cusumano

		country	description	designation	points	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
country	province
Argentina	Mendoza Province	Argentina	If the color doesn't tell the full story, the ...	Nicasia Vineyard	97	120.0	Mendoza Province	Mendoza	NaN	Michael Schachner	@wineschach	Bodega Catena Zapata 2006 Nicasia Vineyard Mal...	Malbec	Bodega Catena Zapata
Argentina	Other	Argentina	Take note, this could be the best wine Colomé ...	Reserva	95	90.0	Other	Salta	NaN	Michael Schachner	@wineschach	Colomé 2010 Reserva Malbec (Salta)	Malbec	Colomé
Armenia	Armenia	Armenia	Deep salmon in color, this wine offers a bouqu...	Estate Bottled	88	15.0	Armenia	NaN	NaN	Mike DeSimone	@worldwineguys	Van Ardi 2015 Estate Bottled Rosé (Armenia)	Rosé	Van Ardi
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
Uruguay	Progreso	Uruguay	Rusty in color but deep and complex in nature,...	Etxe Oneko Fortified Sweet Red	90	46.0	Progreso	NaN	NaN	Michael Schachner	@wineschach	Pisano 2007 Etxe Oneko Fortified Sweet Red Tan...	Tannat	Pisano
	San Jose	Uruguay	Baked, sweet, heavy aromas turn earthy with ti...	El Preciado Gran Reserva	87	50.0	San Jose	NaN	NaN	Michael Schachner	@wineschach	Castillo Viejo 2005 El Preciado Gran Reserva R...	Red Blend	Castillo Viejo
	Uruguay	Uruguay	Cherry and berry aromas are ripe, healthy and ...	Blend 002 Limited Edition	91	22.0	Uruguay	NaN	NaN	Michael Schachner	@wineschach	Narbona NV Blend 002 Limited Edition Tannat-Ca...	Tannat-Cabernet Franc	Narbona

	len	min	max
country
Argentina	3800.0	4.0	230.0
Armenia	2.0	14.0	15.0
Australia	2329.0	5.0	850.0
...	...	...	...
US	54504.0	4.0	2013.0
Ukraine	14.0	6.0	13.0
Uruguay	109.0	10.0	130.0

	country	province	len
0	Argentina	Mendoza Province	3264
1	Argentina	Other	536
2	Armenia	Armenia	2
...	...	...	...
422	Uruguay	Progreso	11
423	Uruguay	San Jose	3
424	Uruguay	Uruguay	24

	country	province	len
179	Greece	Muscat of Kefallonian	1
192	Greece	Sterea Ellada	1
194	Greece	Thraki	1
...	...	...	...
118	France	Bordeaux	5941
415	US	Washington	8639
392	US	California	36247

	country	description	designation	points	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
913	NaN	Amber in color, this wine has aromas of peach ...	Asureti Valley	87	30.0	NaN	NaN	NaN	Mike DeSimone	@worldwineguys	Gotsa Family Wines 2014 Asureti Valley Chinuri	Chinuri	Gotsa Family Wines
3131	NaN	Soft, fruity and juicy, this is a pleasant, si...	Partager	83	NaN	NaN	NaN	NaN	Roger Voss	@vossroger	Barton & Guestier NV Partager Red	Red Blend	Barton & Guestier
4243	NaN	Violet-red in color, this semisweet wine has a...	Red Naturally Semi-Sweet	88	18.0	NaN	NaN	NaN	Mike DeSimone	@worldwineguys	Kakhetia Traditional Winemaking 2012 Red Natur...	Ojaleshi	Kakhetia Traditional Winemaking
...	...	...	...	...	...	...	...	...	...	...	...	...	...
129408	NaN	El Capricho is one of Uruguay's more consisten...	Reserve	89	22.0	NaN	NaN	NaN	Michael Schachner	@wineschach	El Capricho 2015 Reserve Tempranillo	Tempranillo	El Capricho
129590	NaN	A blend of 60% Syrah, 30% Cabernet Sauvignon a...	Shah	90	30.0	NaN	NaN	NaN	Mike DeSimone	@worldwineguys	Büyülübağ 2012 Shah Red	Red Blend	Büyülübağ
129900	NaN	This wine offers a delightful bouquet of black...	NaN	91	32.0	NaN	NaN	NaN	Mike DeSimone	@worldwineguys	Psagot 2014 Merlot	Merlot	Psagot

	country	description	designation	score	price	province	region_1	region_2	taster_name	taster_twitter_handle	title	variety	winery
wineIndex
0	Italy	Aromas include tropical fruit, broom, brimston...	Vulkà Bianco	87	NaN	Sicily & Sardinia	Etna	NaN	Kerin O’Keefe	@kerinokeefe	Nicosia 2013 Vulkà Bianco (Etna)	White Blend	Nicosia
1	Portugal	This is ripe and fruity, a wine that is smooth...	Avidagos	87	15.0	Douro	NaN	NaN	Roger Voss	@vossroger	Quinta dos Avidagos 2011 Avidagos Red (Douro)	Portuguese Red	Quinta dos Avidagos
2	US	Tart and snappy, the flavors of lime flesh and...	NaN	87	14.0	Oregon	Willamette Valley	Willamette Valley	Paul Gregutt	@paulgwine	Rainstorm 2013 Pinot Gris (Willamette Valley)	Pinot Gris	Rainstorm
3	US	Pineapple rind, lemon pith and orange blossom ...	Reserve Late Harvest	87	13.0	Michigan	Lake Michigan Shore	NaN	Alexander Peartree	NaN	St. Julian 2013 Reserve Late Harvest Riesling ...	Riesling	St. Julian
4	US	Much like the regular bottling from 2012, this...	Vintner's Reserve Wild Child Block	87	65.0	Oregon	Willamette Valley	Willamette Valley	Paul Gregutt	@paulgwine	Sweet Cheeks 2012 Vintner's Reserve Wild Child...	Pinot Noir	Sweet Cheeks

	Column_1	Column_2
Row_1	Value_1.1	Value_1.2
Row_2	Value_2.1	Value_2.2

	Apples	Bananas
2017 Sales	35	21
2018 Sales	41	34