Data Processing Examples
Read in a .txt file:
df_i_header_file = OutputDir+fileName
df_i_header = pd.read_csv(df_i_header_file, sep=r'\t', engine='python',header=None)
Rename column names:
df_i_header.columns = ['Category', 'Description']
df_Vstar = df_Vstar.rename(columns={x:x+'_FAH_comp' for x in df_Vstar.columns if x not in ['ROW']})
df_Head_t_full['Index'] = df_Head_t_full['Index'].astype(str).apply(lambda x:x.zfill(5))
- Rename columns (remove the last 4 characters) for every column except one:
AFS_basic_ag_dom_agg = AFS_basic_ag_dom_agg.rename(columns= lambda x: str(x)[:-4] if x!='Sector' else x)
Rename column names by location:
df_outlay.columns.values[1]='PCE_basic'
Use one column as column names for a dataframe:
df_L.columns = df_T_header["ROW"]
Use two columns as a dictionary to rename:
df = df.rename(columns=dict(zip(df_lut_ctry.iloc[:,0], df_lut_ctry.iloc[:,1])))
Sort by columns:
sorted_df = df.sort_values(by=['Column1', 'Column2'], ascending=[True, False])
Reorder columns by column location index:
df_i_header = df_i_header.iloc[:,[0,2,1]]
Move one column to the beginning:
df_L_long.insert(0,'YEAR',df_L_long.pop("YEAR"))
Drop duplicates
df_lut_ctry = df_lut_ctry.drop_duplicates()
Drop a column if exists:
df_bal_3_noROW = df_bal_3.drop('ROW', axis=1, errors='ignore')
Create subsets:
df_lut_ctry = df_lut_ctry.iloc[:,[1,0]]
df_lut_ctry = df_lut_ctry.iloc[:-1,:]
Remove last column:
xfood_full = xfood_full.iloc[:,:-1]
Keep rows using values in another dataframe:
df_cord_1 = df_cord[df_cord.Country.isin(df_cor_1.Country)]
Create a new column based on the values in an existing column:
def fn_FdAbb(df_temp, colName):
    """Add an abbreviation column ``<colName>Abb`` to *df_temp*.

    Maps the full final-demand description strings found in
    ``df_temp[colName]`` to short codes (XH, XNPISH, XG, XK01, XK02, XK03).
    Mutates *df_temp* in place and returns it.

    Parameters
    ----------
    df_temp : pandas.DataFrame
        Frame containing the description column.
    colName : str
        Name of the column holding the full descriptions.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the new ``<colName>Abb`` column added.
    """
    # Single description -> abbreviation lookup drives both the condition
    # list and the result list, so the two can never drift out of sync.
    abb_map = {
        'Household final consumption P.3h': "XH",
        'Non-profit institutions serving households P.3n': "XNPISH",
        'Government final consumption P.3g': "XG",
        'Gross fixed capital formation P.51': "XK01",
        'Changes in inventories P.52': "XK02",
        'Acquisitions less disposals of valuables P.53': "XK03",
    }
    conditions = [df_temp[colName] == desc for desc in abb_map]
    results = list(abb_map.values())
    colNew = colName + "Abb"
    # default=0 makes np.select's implicit fallback explicit: rows matching
    # no known description get 0 (may be promoted to '0' by numpy's string
    # promotion — confirm downstream handling of unmatched rows).
    df_temp[colNew] = np.select(conditions, results, default=0)
    return df_temp
Create a new column by concatenating string columns
df_i_header['ctry_col_abb'] = df_i_header['Abb'].astype(str) +"_"+df_i_header['DescriptionAbb']
Change values based on conditions:
df_Head_t_full.Ctry_Index = np.where(df_Head_t_full.Ctry_Index=='ROW_14839','ZZZ_Discrepancies', df_Head_t_full.Ctry_Index)
Create a column name in a loop:
colNew = colName+"Abb"
df_temp[colNew] = np.select(conditions,results)
Loop through all columns except one:
for col in df_cou.columns.drop('Row_ctry'):
# print(col)
df_i = df_cou[['ROW', col]]
Convert wide to long:
df_FD_long = df_FD.melt(id_vars=["ROW"],
var_name="COL",
value_name="VALUE")
Reshape:
df_row = df_i.iloc[nth_row,1:].values.reshape(189, n_cols)
Create subsets:
select all columns except the first one:
df_trd_wl.iloc[:,1:]
select everything except the last row and the first column:
xfood_trd_dom_agg_T.iloc[:-1,1:]
Different sums:
- create a row which is the total of all other rows by column:
df.loc['tot']=df.sum()
- Count by group:
df_cord['sum'] = df_cord.groupby('Country')['Unnamed: 3'].transform('count')
- Sum all columns except the first two columns:
df_bal_2_test['sum'] = df_bal_2_test.iloc[:,2:].sum(axis=1)
- Sum by rows and by columns:
row_sums = df_bal_3.sum(axis=1)
col_sums = df_bal_3.sum(axis=0)
combined_df = pd.DataFrame({'Row Sums': row_sums, 'Column Sums': col_sums})
Convert index to a column:
df_bal_3 = df_bal_3.reset_index()
remove last row:
df_trd_wl = df_trd_wl[:-1]
xfood_bp_dom.iloc[:-1,1:]
multiply one column to all other columns:
df_trd_rt = df_trd_wl.iloc[:,1:].multiply(xfood_trd_dom_agg_T.iloc[:-1,1])
Replace some rows by another dataset:
df_Xfood_1.update(df_Xfood_trans)
Horizontally concatenate dataframes:
df_cord_full = pd.concat([df_Head_t_full, df_cord.iloc[:,4:]], axis=1)
Merge multiple dataframes horizontally:
dfs = [df_gh, df_imp, df_exp, df_eDValue_Open_P, df_eDValue_GH_P, df_eDValue_imp_P, df_eDValue_exp_P, df_eExpPr]
df = df_open
for df_i in dfs:
df = pd.merge(df, df_i, on = ['F','N','D'], how='outer')
Export to excel spreadsheets:
writer = pd.ExcelWriter(OutputDir+"/JY.xlsx", engine='xlsxwriter')
df_lut_ctry.to_excel(writer,sheet_name = "Country")
df_t_header_out.to_excel(writer,sheet_name="Industry")
writer.close()
bcp load to sql:
command=r"""bcp [db].[schema].[table] in D:\...\To_sql_row.csv -c -t"|" -F 2 """ +config.bcpConfig
subprocess.run(command)
Output to csv:
df_L_long.to_csv(OutputDir + "/ToSQL_L_Long.csv", sep="|", index=False)