I have recently done a small project in a financial institution. The Oracle SQL query has became so complex and took very long query execution time due to the schema design and other restriction of the Oracle database configuration. Therefore, I used the python Pandas to do the ETL instead.
Below is just a simplified version of the original Python script for the project, but the thought process should be similar in general:
import cx_Oracle
import pandas as pd
from pandas import DataFrame
con = cx_Oracle.connect(‘user/pass@host”)
cur = con.cursor()
rows = cur.execute(“select * from table”)
result = rows.fetchall()
result[10]
df = DataFrame(result, columns = [‘a’, ‘b’, ‘c’])
/# alternatively use below pandas method to import sql query result directly into dataframe #/
sql1 = “SELECT * from table ” [Note: if the sql query is very long, use “”” to wrap the sql to avoid syntax error.]
df1 = pd.read_sql(sql1, con)
sql2 = “””SELECT * from table “””
df2 = pd.read_sql(sql2, con)
sql3 = “””SELECT * from table “””
df3 = pd.read_sql(sql3, con)
sql4 = “””SELECT * from table “””
df4 = pd.read_sql(sql4, con)
sql5 = “””SELECT * from table “””
df5 = pd.read_sql(sql5, con)
wip2 = pd.merge(txn_df, price_df, how= ‘outer’, left_on =[‘stock_ID’], right_on =[‘stock_ID’])
wip2[‘PERIOD’] = wip2.DATA_MONTH.dt.to_period(‘M’) – wip2.TRADE_MONTH.dt.to_period(‘M’)
wip2 = wip2[(wip2.PERIOD =-18)]
pd.set_option(‘display.max_columns’, None)
wip3 = pd.merge(wip2, df3, how =’left’, left_on = [‘Month’,’CODE’,’ID’], right_on =[‘MONTH’,’code’,’ID’])
wip4 = pd.merge(wip3, df4, how =’left’, left_on = [‘Month’,’code’], right_on =[‘MONTH’,’code’])
wip5 = pd.merge(wip4, df5, how =’left’, left_on = [‘MONTH2′,’Code’], right_on =[‘MONTH’,’code’])
wip5 = wip5.drop(wip5.columns[38], axis=1)
wip5 = wip5.rename(columns={‘old_column_name’: ‘new_column_name’})
df = df.rename(columns={“””TRUNC(DATA_DATE,’MM’)”””: ‘DATA_DATE’})
wip5 = pd.merge(wip4, df5, how =’left’, left_on = [‘Date’,’CODE’], right_on =[‘DATE’,’code’])
wip5[‘BUY_BASE’] = wip5.BUY_AMT / wip5.BUY_QTN
wip5[‘SELL_BASE’] = wip5.SELL_AMT / wip5.SELL_QTN
conditions = [
(wip5[‘TRANSACTION_TYPES’] == ‘BUY’) & (wip5[‘TRADED_QUANTITY’] == wip5[‘HOLDING_QUANTITY’]),
(wip5[‘TRANSACTION_TYPES’] == ‘SELL’) & (wip5[‘TRADED_QUANTITY’] == wip5[‘HOLDING_QUANTITY’]),
(wip5[‘TRANSACTION_TYPES’] == ‘BUY’) & (wip5[‘TRADED_QUANTITY’] wip5[‘HOLDING_QUANTITY’]),
(wip5[‘TRANSACTION_TYPES’] == ‘SELL’) & (wip5[‘TRADED_QUANTITY’] wip5[‘HOLDING_QUANTITY’])]
choices = [‘X_BUY’, ‘X_SELL’, ‘X_x’, ‘X_t”]
wip5[‘ACTION’] = np.select(conditions, choices)
wip5[‘BUY_RETURN’] = (((wip5.PRICE * wip5.MONTHLY_EXCHANGE_RATE) – (wip5.BUY_BASE*wip5.BASE_EXCHANGE_RATE))/ (wip5.BUY_BASE*wip5.BASE_EXCHANGE_RATE))
wip5[‘SELL_RETURN’] = (((wip5.PRICE * wip5.MONTHLY_EXCHANGE_RATE) – (wip5.SELL_BASE*wip5.BASE_EXCHANGE_RATE))/ (wip5.SELL_BASE*wip5.BASE_EXCHANGE_RATE))
wip5.loc[:,’BUY_BASE’:’SELL_RETURN’]
exchange_df.EXCHANGE_MONTH.value_counts().sort_index()
wip5.groupby([‘PORTFOLIO_CODE’,’ACTION’,’TRADE_MONTH’,’DATA_MONTH’,’PERIOD’])[‘BUY_RETURN’,’SELL_RETURN’].mean()
FINAL=wip5.groupby([‘PORTFOLIO_CODE’,’ACTION’,’TRADE_MONTH’,’DATA_MONTH’,’PERIOD’])[‘BUY_RETURN’,’SELL_RETURN’].mean()
FINAL.to_csv(‘result.csv’, sep=’\t’)
DATA = wip5.loc[wip5[‘code’] == ‘xxxxx’]
DATA.to_csv(‘xxxxx_DATA.csv’,sep=’\t’)