Professor, Departamento de Ciencias Sociales, Pontificia Universidad Católica del Perú, jmagallanes@pucp.edu.pe
Visiting Professor, Evans School of Public Policy and Governance / Senior Data Science Fellow, eScience Institute, University of Washington, magajm@uw.edu
In this session we will use Python to:
Collect data into Python as data frames
Clean the data
Merge both tables
Prepare a file for further analysis
# Location of data file
linkFile="https://github.com/eScienceWinterSchool/PythonSession/raw/master/data/HDI_Table.xlsx"
Let's read the table with pandas; since it is an Excel file, pandas requires openpyxl:
# available in my computer?
!pip show openpyxl
If it is not available, install it (for instance via Anaconda). Once it is available, continue:
import pandas as pd
hdiFile=pd.read_excel(linkFile) # only for excel
Take a look (since the source is an Excel file, it might be a good idea to open it in Excel too):
hdiFile
Let's bring in another table, this time from Wikipedia. Make sure html5lib and beautifulsoup4 are installed before running the next code (check with pip show):
!pip show html5lib beautifulsoup4
Now bring the tables:
#path
linkwiki='https://en.wikipedia.org/wiki/The_Economist_Democracy_Index'
# call
sortableTables=pd.read_html(io=linkwiki, # the link to the main webpage
                            flavor='bs4', # tell pandas to use bs4
                            attrs={"class": "wikitable sortable"}) # an attribute of the tables to scrape
Pay attention to what pd.read_html returned:
# what you got, and how many you got
type(sortableTables), len(sortableTables)
We have three tables; let's look at a couple of them:
# let's see the second one
sortableTables[1].head(10)
# let's see the third one
sortableTables[2].head(10)
Let's keep the right one:
# this is the one
demoTable=sortableTables[2].copy()
hdiFile.iloc[[3,4],:]
As you see, the column names are in different positions:
# here
hdiFile.iloc[3,2:]
# and here
hdiFile.iloc[4,:2]
Can we concatenate those values?
# save column names
RealHeaders=hdiFile.iloc[4,:2]+hdiFile.iloc[3,2:]
# these are:
RealHeaders
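The NaN values above appear because adding two pandas Series aligns them by index label, not by position; a label present in only one Series yields NaN. A minimal sketch with made-up header pieces:

```python
import pandas as pd

# two slices with disjoint index labels, like the two header rows
a = pd.Series(['HDI rank', 'Country'], index=[0, 1])
b = pd.Series(['Value', 'Rank'], index=[2, 3])

# '+' aligns on index labels, so every position is NaN here
print(a + b)

# converting to lists concatenates by position instead
print(a.to_list() + b.to_list())  # ['HDI rank', 'Country', 'Value', 'Rank']
```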
Adding Series did not concatenate; it is easier with lists, so we use .to_list():
# save column names turned to lists
RealHeaders=hdiFile.iloc[4,:2].to_list()+hdiFile.iloc[3,2:].to_list()
# these are:
RealHeaders
Let's see how it looks now:
# rename all the columns
hdiFile.columns=RealHeaders
# newDF
better_1=hdiFile.copy()
# see head
better_1.head()
Notice the repeated column names (HDI rank) and the NaN. Notice also that we do not need the last four columns. Let's solve that:
# without the last 4 columns
better_1.iloc[:,:-4]
We use the previous result to rewrite the original:
# then,
better_1=better_1.iloc[:,:-4]
We still have column names with missing values:
better_1.columns
...let's get rid of those:
#like this?
better_1.columns.dropna()
# make the change!
BetterHeaders=better_1.columns.dropna()
#result
BetterHeaders
#subsetting again
better_1=better_1.loc[:,BetterHeaders]
better_2=better_1.copy()
#see
better_2.head(20)
Notice above that the column names still carry text in parentheses, stray spaces, and inconsistent casing.
Let's see what can be done using Regular Expressions (REGEX):
# bye anything between parentheses
better_2.columns.str.replace(r'\(.+\)',"", regex=True)
# bye anything between parentheses, bye leading-trailing spaces
better_2.columns.str.replace(r'\(.+\)',"", regex=True).str.strip()
# bye anything between parentheses, bye leading-trailing spaces, title case
better_2.columns.str.replace(r'\(.+\)',"", regex=True).\
    str.strip().\
    str.title()
Let's keep this last one for a while:
#changing column names
better_2.columns=better_2.columns.str.replace(r'\(.+\)',"", regex=True).\
    str.strip().\
    str.title()
#so
better_2
Now, it is time to shorten column names:
a. CamelCase?
better_2.columns.str.replace(" ",'',regex=False)
b. Just acronyms:
# each column name split into words:
[name.split() for name in better_2.columns[2::]]
# first letter of each word
[[word[0] for word in name.split()] for name in better_2.columns[2::]]
# final result
[''.join([word[0] for word in name.split()]) for name in better_2.columns[2::]]
Let's keep the last alternative:
newNames=[''.join([word[0] for word in name.split()]) for name in better_2.columns[2::]]
better_2.columns=better_2.columns[:2].str.replace(" ",'',regex=False).to_list()+newNames
#newDF
better_3=better_2.copy()
Finally, let's look at the result and keep only the rows that actually have an HDI rank:
better_3.head(20)
better_3[~pd.isna(better_3['HdiRank'])]
# then
hdiSubset=better_3[~pd.isna(better_3['HdiRank'])].copy()
#see
hdiSubset
We still have one bad row of data:
hdiSubset.drop(index=4)
We just drop it:
hdiSubset.drop(index=4, inplace=True)
hdiSubset.reset_index(drop=True, inplace=True)
#see
hdiSubset
Use try/except to scan each column for cells made only of non-word symbols (the .str accessor raises on non-string columns, so we skip those):
for i in range(hdiSubset.shape[1]):
    try:
        print(hdiSubset.iloc[:,i][hdiSubset.iloc[:,i].str.fullmatch(r"\W+",na=False)])
    except AttributeError:
        pass
We do not have weird symbols, but if we had:
# replacing !
badSymbols=["..",'xx','tba']
hdiSubset.replace(to_replace=badSymbols,
                  value=float('nan'), # value=None would NOT set missing values
                  inplace=True)
#result
hdiSubset
# with all missing (after the first column)
hdiSubset[hdiSubset.iloc[:,1:].isna().all(axis=1)]
# with at least one missing (after the first column)
hdiSubset[hdiSubset.iloc[:,1:].isna().any(axis=1)]
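The axis=1 argument makes isna().all() / isna().any() evaluate across the columns of each row. A toy sketch (the frame here is made up):

```python
import pandas as pd
import numpy as np

toy = pd.DataFrame({'a': [1.0, np.nan, np.nan],
                    'b': [2.0, 3.0, np.nan]})

# rows where every value is missing: only the last row
print(toy[toy.isna().all(axis=1)])

# rows with at least one missing value: the last two rows
print(toy[toy.isna().any(axis=1)])
```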
hdiClean=hdiSubset.copy()
Let's check the data types:
# explore
hdiClean.info()
This way you drop a column name (not the whole column):
hdiClean.columns.drop('Country')
The numbers have been recognised as object type, likely because some cells held non-numeric values (such as the bad symbols we just replaced).
# keep all numeric columns
allNumCols=hdiClean.columns.drop('Country')
allNumCols
# as easy as:
hdiClean[allNumCols]=hdiClean[allNumCols].apply(pd.to_numeric)
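By default pd.to_numeric raises an error if a cell cannot be parsed; had any stray symbol survived, the errors='coerce' option would turn it into NaN instead. A quick sketch:

```python
import pandas as pd

s = pd.Series(['1.5', '2', '..'])  # '..' is not a number

# errors='coerce' maps unparseable cells to NaN instead of raising
nums = pd.to_numeric(s, errors='coerce')
print(nums)  # 1.5, 2.0, NaN
```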
hdiFormat=hdiClean.copy()
#recheck
hdiFormat.info()
That was easy!
# can you apply math?
hdiFormat.drop(columns=['Country']).max()
Before the next process, let's quickly preprocess the table we scraped:
# brief look
demoTable
#data types
demoTable.info()
# keep some:
someCols=demoTable.columns[~demoTable.columns.str.contains('Δ')]
#subset
demoTable=demoTable[someCols].copy()
demoTable.columns=demoTable.columns.str.replace(r'\W',"",regex=True)
#then
demoTable
We have a categorical column; let's give it the right data type:
#rewrite the levels in ascending order:
correctLevels=['Authoritarian', 'Hybrid regime', 'Flawed democracy','Full democracy']
#format as ordinal:
demoTable.Regimetype=pd.Categorical(demoTable.Regimetype,categories=correctLevels,ordered=True)
demoFormat=demoTable.copy()
demoFormat.info()
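The ordered categorical pays off because comparisons and sorting follow the declared level order rather than alphabetical order. A minimal sketch with the same levels:

```python
import pandas as pd

levels = ['Authoritarian', 'Hybrid regime', 'Flawed democracy', 'Full democracy']
ser = pd.Series(pd.Categorical(['Full democracy', 'Authoritarian', 'Hybrid regime'],
                               categories=levels, ordered=True))

# order-aware comparison: only 'Full democracy' is above 'Hybrid regime'
print(ser > 'Hybrid regime')

# sorting respects the regime ordering, not the alphabet
print(ser.sort_values().to_list())
```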
If we are confident we did a good cleaning and formatting, this step should be easy:
# left_on/right_on are not needed here (only when the key column names differ)
HdiDemo=hdiFormat.merge(demoFormat,left_on='Country', right_on='Country')
HdiDemo
Notice the number of rows returned above, and compare it with the number of rows in each data frame:
len(HdiDemo),len(hdiFormat),len(demoFormat)
If you do not want to check country names, you can stop here.
The smaller row count of the two tables is the maximum you can expect after the merge. Let's check the key values that were not matched:
onlyHDI=set(hdiFormat.Country)-set(demoFormat.Country)
onlyDEMO=set(demoFormat.Country)-set(hdiFormat.Country)
onlyHDI
onlyDEMO
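An alternative to the set arithmetic above is an outer merge with indicator=True, which labels each row 'both', 'left_only', or 'right_only'. A sketch on toy frames (the data is made up):

```python
import pandas as pd

left = pd.DataFrame({'Country': ['Peru', 'Chile', 'Czechia'], 'hdi': [1, 2, 3]})
right = pd.DataFrame({'Country': ['Peru', 'Chile', 'Czech Republic'], 'demo': [4, 5, 6]})

m = left.merge(right, on='Country', how='outer', indicator=True)

# rows whose key failed to match in the other frame
print(m[m['_merge'] != 'both'])
```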
The previous objects (onlyDEMO, onlyHDI) hold the key values that found no match in the other data frame. If you want to recover some of these values, you may follow these steps (you may need to install thefuzz):
from thefuzz import process as fz
# take a country from onlyDEMO
# and get the country in onlyHDI that matches it best, with the match score!
# notice I sorted onlyDEMO
[(aDemoCountry,fz.extractOne(aDemoCountry, onlyHDI)) for aDemoCountry in sorted(onlyDEMO)]
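If thefuzz is not at hand, the standard library's difflib implements a similar closest-match idea (not thefuzz's exact scorer); the helper below is hypothetical:

```python
import difflib

candidates = ['Czechia', "Lao People's Democratic Republic", 'Turkiye']

def closest(name, pool):
    # pick the candidate with the highest SequenceMatcher similarity
    best = max(pool, key=lambda c: difflib.SequenceMatcher(None, name, c).ratio())
    score = round(difflib.SequenceMatcher(None, name, best).ratio() * 100)
    return best, score

print(closest('Czech Republic', candidates))
```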
Some countries will never find a match, so let's drop them:
notInHDI=['North Korea','Taiwan']
demoFormat_sub=demoFormat[~demoFormat.Country.isin(notInHDI)].copy() # copy avoids a SettingWithCopyWarning later
Other countries did find a match, but the fuzzy matcher picked the wrong one. Let's fix those by hand:
# dictionary of changes
changesDEMO={'Czech Republic':'Czechia',
'Laos':"Lao People's Democratic Republic"}
# make the replacement
demoFormat_sub['Country']=demoFormat_sub['Country'].replace(changesDEMO)
As we have made changes to the column values, let's redo the fuzzy-matching check:
onlyHDI=set(hdiFormat.Country)-set(demoFormat_sub.Country)
onlyDEMO=set(demoFormat_sub.Country)-set(hdiFormat.Country)
[(aDemoCountry,fz.extractOne(aDemoCountry, onlyHDI)) for aDemoCountry in sorted(onlyDEMO)]
All matches are great. Let's create a dictionary of changes:
changesDEMO={aDemoCountry:fz.extractOne(aDemoCountry, onlyHDI)[0] for aDemoCountry in sorted(onlyDEMO)}
changesDEMO
# replace in democracy
demoFormat_sub['Country']=demoFormat_sub['Country'].replace(changesDEMO)
Let's do a new merge:
# did you get more rows?
HdiDemo_2=hdiFormat.merge(demoFormat_sub)
# lengths
len(HdiDemo_2),len(HdiDemo), len(hdiFormat),len(demoFormat_sub),len(demoFormat)
The format should still be good:
HdiDemo_2.info()
HdiDemo_2.describe().loc[['min','max']].T #T for transposing
Since the columns have very different ranges, it is a good idea to request a boxplot (make sure matplotlib is installed):
import matplotlib.pyplot as plt
HdiDemo_2.plot(kind='box', rot=90,fontsize=5)
plt.semilogy();
Notice that our concern here is the numeric data. Scaling is rarely an issue for categorical columns, though some cases may require thought.
Let me get the column names of the numeric columns:
import numpy as np
colsToScale = HdiDemo_2.select_dtypes([np.number]).columns
colsToScale
Time to produce new ranges (make sure you have previously installed scikit-learn):
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_minmax = scaler.fit_transform(HdiDemo_2.loc[:,colsToScale].to_numpy())
df_scaled = pd.DataFrame(df_minmax, columns=colsToScale)
Let's explore the result:
df_scaled.describe().loc[['min','max']].T
df_scaled.plot(kind='box', rot=90,fontsize=5);
Let's add a suffix to the column names:
df_scaled.columns=df_scaled.columns+"_mM"
# concat to the right (instead of bottom) with axis=1
pd.concat([HdiDemo_2,df_scaled],axis=1)
So this is our last version:
hdidem_plus=pd.concat([HdiDemo_2,df_scaled],axis=1)
hdidem_plus.head()
hdidem_plus.to_pickle("hdidem_plus.pkl")
# you will need: DF=pd.read_pickle("hdidem_plus.pkl")
# or:
# from urllib.request import urlopen
# DF=pd.read_pickle(urlopen("https://...../hdidem_plus.pkl"),compression=None)
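A quick round-trip check of the pickle workflow, using a temporary file so nothing real is overwritten (the frame is made up):

```python
import os
import tempfile
import pandas as pd

df = pd.DataFrame({'Country': ['Peru'], 'Hdi': [0.77]})

with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'demo.pkl')
    df.to_pickle(path)           # serialize, preserving dtypes
    back = pd.read_pickle(path)  # restore

print(back.equals(df))  # True
```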
If you also want an R-native version of the file, rpy2 can save the data frame as RDS (check that rpy2 is installed):
!pip show rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.packages import importr
base = importr('base')
base.saveRDS(hdidem_plus,file="hdidem_plus.RDS")
#In R, you call it with: DF = readRDS("hdidem_plus.RDS")
#or, if read from cloud: DF = readRDS(url("https://...../hdidem_plus.RDS"))