Financial Time Series in Python using Pandas




In [1]:
######################################################
#    Financial Time Series in Python using Pandas    #
######################################################

# This time we will cover

# 1) Series in Pandas
# 2) DataFrame in Pandas
# 3) A practical example with more Pandas operations and OLS regression

import numpy as np
import pandas as pd
In [2]:
#======================================================
# 1) Series in Pandas
#======================================================
# A Series is a one-dimensional data container, similar to a list or array.
# By default, each item gets an integer index label starting from 0.

# First, let's create a Series
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'])
s
Out[2]:
0                7
1       Heisenberg
2             3.14
3      -1789710578
4    Happy Eating!
dtype: object
In [3]:
# Second, assign an index label to each item
s = pd.Series([7, 'Heisenberg', 3.14, -1789710578, 'Happy Eating!'],
              index=['A', 'Z', 'C', 'Y', 'E'])
s
Out[3]:
A                7
Z       Heisenberg
C             3.14
Y      -1789710578
E    Happy Eating!
dtype: object
In [4]:
# Third, create a Series from a dictionary; the keys become the index
d = {'Chicago': 1000, 'New York': 1300, 'Portland': 900, 'San Francisco': 1100,
     'Austin': 450, 'Boston': None}
cities = pd.Series(d)
cities
Out[4]:
Austin            450.0
Boston              NaN
Chicago          1000.0
New York         1300.0
Portland          900.0
San Francisco    1100.0
dtype: float64
In [5]:
# use keys/index labels to select multiple items from the Series
cities[['Chicago', 'Portland']]
Out[5]:
Chicago     1000.0
Portland     900.0
dtype: float64
In [6]:
# use a Boolean condition to filter
cities[cities < 1000]
Out[6]:
Austin      450.0
Portland    900.0
dtype: float64
In [7]:
# check whether a certain key is contained in the Series' index
print('Seattle' in cities)
print('San Francisco' in cities)
False
True
In [8]:
# Mathematical operations can be applied element-wise using scalars and functions.
# Note that these operations return new Series and do not change the original.

# square city values
cities ** 2
Out[8]:
Austin            202500.0
Boston                 NaN
Chicago          1000000.0
New York         1690000.0
Portland          810000.0
San Francisco    1210000.0
dtype: float64
In [9]:
# square city values using a NumPy ufunc
np.square(cities)
Out[9]:
Austin            202500.0
Boston                 NaN
Chicago          1000000.0
New York         1690000.0
Portland          810000.0
San Francisco    1210000.0
dtype: float64
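These element-wise operations return new Series objects; cities itself is unchanged. To keep a transformed result, assign it to a name (a minimal sketch):

# 'cities' still holds the original values after the squaring above
print(cities['Chicago'])      # 1000.0
# assign the result if you want to keep it
cities_squared = cities ** 2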
In [10]:
#======================================================
# 2) DataFrame in Pandas
#======================================================

# A DataFrame is a tabular data structure comprised of rows and columns.
# You can also think of a DataFrame as a group of Series objects sharing an index.
df = pd.DataFrame([10,20,30,40,50], columns = ['numbers'], 
                 index = ['a','b','c','d','e'])
df
Out[10]:
numbers
a 10
b 20
c 30
d 40
e 50
In [11]:
# select multiple rows by index label (.ix is deprecated; use .loc)
df.loc[['a', 'd']]
Out[11]:
numbers
a 10
d 40
In [12]:
# select via an Index object (see the .iloc sketch after this cell's output)
df.loc[df.index[1:3]]
Out[12]:
numbers
b 20
c 30
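The same rows can also be selected by integer position with .iloc; a small sketch, equivalent to the two selections above:

# positional selection instead of label-based selection
df.iloc[[0, 3]]    # rows 'a' and 'd'
df.iloc[1:3]       # rows 'b' and 'c'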
In [13]:
#sum per column
df.sum()
Out[13]:
numbers    150
dtype: int64
In [14]:
# apply a lambda function to each column
df.apply(lambda x: x ** 2)
Out[14]:
numbers
a 100
b 400
c 900
d 1600
e 2500
In [15]:
# add a new column; indices are aligned automatically
df['name'] = pd.DataFrame(['Yves','Guido','Felix','Francesc','Max'],index=['d','a','b','e','c'])
df
Out[15]:
numbers name
a 10 Guido
b 20 Felix
c 30 Max
d 40 Yves
e 50 Francesc
In [16]:
# append a DataFrame object (add a new row), providing the appropriate index
# (a pd.concat alternative is sketched after this cell's output)
df = df.append(pd.DataFrame({'numbers':60, 'name':'John'}, index=['f']))
df
Out[16]:
name numbers
a Guido 10
b Felix 20
c Max 30
d Yves 40
e Francesc 50
f John 60
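Note that DataFrame.append was deprecated and later removed in pandas 2.x; the same row can be added with pd.concat, as sketched here:

# pd.concat equivalent of the append call above
new_row = pd.DataFrame({'numbers': 60, 'name': 'John'}, index=['f'])
df = pd.concat([df, new_row])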
In [17]:
# convert a numpy.ndarray to a pandas DataFrame
a = np.random.standard_normal((9, 4))
a = a.round(6)          # round() returns a new array, so assign it back
df = pd.DataFrame(a)
df
Out[17]:
0 1 2 3
0 1.315131 -0.069730 -0.211395 1.001188
1 1.772644 -0.505542 -2.036797 1.322028
2 0.114443 -1.763574 0.929363 0.384832
3 1.153564 1.399280 0.123830 -0.884408
4 -1.258273 -0.532170 0.557115 -0.607815
5 -0.333240 1.888825 -1.079276 0.466121
6 -2.066903 -0.537521 -2.060987 0.632245
7 -0.731049 -1.368871 0.757279 -0.643934
8 -0.170428 1.890354 -0.006337 1.695278
In [18]:
# add column titles and a date index
df.columns = ['No1', 'No2', 'No3', 'No4']
dates = pd.date_range('2015-1-1', periods=9, freq='M')
df.index = dates
df
Out[18]:
No1 No2 No3 No4
2015-01-31 1.315131 -0.069730 -0.211395 1.001188
2015-02-28 1.772644 -0.505542 -2.036797 1.322028
2015-03-31 0.114443 -1.763574 0.929363 0.384832
2015-04-30 1.153564 1.399280 0.123830 -0.884408
2015-05-31 -1.258273 -0.532170 0.557115 -0.607815
2015-06-30 -0.333240 1.888825 -1.079276 0.466121
2015-07-31 -2.066903 -0.537521 -2.060987 0.632245
2015-08-31 -0.731049 -1.368871 0.757279 -0.643934
2015-09-30 -0.170428 1.890354 -0.006337 1.695278
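With a DatetimeIndex in place, rows can also be selected with partial date strings; a minimal sketch using the frame above:

# label-based selection on the date index
df.loc['2015-03']              # all rows in March 2015
df.loc['2015-01':'2015-04']    # inclusive slice from January through April 2015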
In [21]:
# Clipboard

# One very handy tool for reading data into Python using pandas:
# after you copy a table to the clipboard, read_clipboard parses it into a DataFrame
data = pd.read_clipboard()
data
Out[21]:
1 2 3
A 0.684295 0.810579 0.238690
B 0.573430 0.300304 0.593980
C 0.312708 0.251962 0.771901
D 0.072734 0.538026 0.769084
E 0.598087 0.498119 0.588915
F 0.518453 0.839630 0.930421
G 0.043837 0.964559 0.217262
H 0.313692 0.282979 0.104801
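Clipboard contents are not reproducible, so the same kind of whitespace-separated table can also be read from an in-memory string with io.StringIO; a sketch reusing the first two rows shown above:

import io

text = """ 1 2 3
A 0.684295 0.810579 0.238690
B 0.573430 0.300304 0.593980
"""
# sep=r'\s+' mirrors read_clipboard's default whitespace splitting;
# the first column is picked up as the index, as in the clipboard example
data = pd.read_csv(io.StringIO(text), sep=r'\s+')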
In [22]:
#===========================================================
# Part 3. A practical example
# with pandas OLS regression and
# pandas.DataFrame.resample
#===========================================================
# Intraday tick data can be huge; here we import intraday
# best market bid & ask prices for USD/JPY and EUR/USD for April 2016.
#
# We simply compare the bid-ask spread (market width)
# with the average of the bid and ask prices (the mid price).

USDJPY = 'USDJPY-2016-04.csv'
EURUSD = 'EURUSD-2016-04.csv'
path = 'tickdata/'

td1 = pd.read_csv(path+USDJPY)
td2 = pd.read_csv(path+EURUSD)

td1.columns = ('product', 'time in GMT', 'bid price', 'ask price')
td2.columns = ('product', 'time in GMT', 'bid price', 'ask price')
In [23]:
td1.head()
Out[23]:
product time in GMT bid price ask price
0 USD/JPY 20160401 00:00:00.127 112.516998 112.524002
1 USD/JPY 20160401 00:00:00.306 112.518997 112.524002
2 USD/JPY 20160401 00:00:00.383 112.518997 112.525002
3 USD/JPY 20160401 00:00:00.687 112.518997 112.525002
4 USD/JPY 20160401 00:00:00.792 112.519997 112.526001
In [24]:
td2.head()
Out[24]:
product time in GMT bid price ask price
0 EUR/USD 20160401 00:00:00.217 1.13785 1.13789
1 EUR/USD 20160401 00:00:00.384 1.13783 1.13788
2 EUR/USD 20160401 00:00:00.689 1.13783 1.13788
3 EUR/USD 20160401 00:00:00.872 1.13783 1.13788
4 EUR/USD 20160401 00:00:01.047 1.13779 1.13781
In [25]:
# Powerful DataFrame resampling:
# you can specify whatever time interval you want.
# Here 'T' stands for minute, so '5T' means five minutes
# (another alias is sketched after this cell's output).
# You can find more offset aliases at
# http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
td1['Datetime'] = pd.to_datetime(td1['time in GMT'])
td1['mid price'] = (td1['bid price'] + td1['ask price'])*0.5 
td1 = td1.set_index(pd.DatetimeIndex(td1['Datetime']))
td1_resam = td1.resample('5T').mean()
td1_resam.head()
Out[25]:
bid price ask price mid price
2016-04-01 00:00:00 112.445046 112.450213 112.447629
2016-04-01 00:05:00 112.385987 112.391507 112.388747
2016-04-01 00:10:00 112.343787 112.349277 112.346532
2016-04-01 00:15:00 112.337256 112.342172 112.339714
2016-04-01 00:20:00 112.322586 112.327850 112.325218
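Other offset aliases work the same way; for example, hourly bars with open/high/low/close summaries of the mid price (a sketch, using the td1 frame indexed above):

# 'H' resamples to hourly intervals; ohlc() gives open/high/low/close per bar
td1_hourly = td1['mid price'].resample('H').ohlc()
td1_hourly.head()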
In [26]:
td2['Datetime'] = pd.to_datetime(td2['time in GMT'])
td2['mid price'] = (td2['bid price'] + td2['ask price'])*0.5 
td2 = td2.set_index(pd.DatetimeIndex(td2['Datetime']))
td2_resam = td2.resample('5T').mean()
td2_resam.head()
Out[26]:
bid price ask price mid price
2016-04-01 00:00:00 1.138016 1.138061 1.138039
2016-04-01 00:05:00 1.138384 1.138437 1.138410
2016-04-01 00:10:00 1.138579 1.138628 1.138604
2016-04-01 00:15:00 1.138395 1.138445 1.138420
2016-04-01 00:20:00 1.138350 1.138397 1.138373
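The introduction to Part 3 mentions comparing the bid-ask spread (market width) with the mid price; a minimal sketch of adding the average spread per interval to the resampled frames:

# average spread per five-minute interval (mean is linear, so
# mean(ask) - mean(bid) equals the mean of the tick-level spreads)
td1_resam['spread'] = td1_resam['ask price'] - td1_resam['bid price']
td2_resam['spread'] = td2_resam['ask price'] - td2_resam['bid price']
td1_resam[['mid price', 'spread']].head()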
In [27]:
import matplotlib.pyplot as plt
%matplotlib inline  
#
df = pd.DataFrame()
df['USD-JPY mid price'] = td1_resam['mid price']
df['EUR-USD mid price'] = td2_resam['mid price']
x = np.arange(0, len(df.index), 1)

fig, ax1 = plt.subplots(figsize=(15, 8))

ax2 = ax1.twinx()
ax1.plot(x, df['USD-JPY mid price'], 'g-')
ax2.plot(x, df['EUR-USD mid price'], 'b-')

ax1.set_xlabel(' # in every five minutes', fontsize=20)
ax1.set_ylabel('USD-JPY mid price', color='g', fontsize=20)
ax2.set_ylabel('EUR-USD mid price', color='b', fontsize=20)
Out[27]:
<matplotlib.text.Text at 0x7efe87643828>
In [28]:
# Next, we want to find out how correlated USD-JPY and EUR-USD are,
# starting with a regression over the full sample
model = pd.ols(y=df['EUR-USD mid price'], x=df['USD-JPY mid price'])
plt.plot(df['USD-JPY mid price'], df['EUR-USD mid price'], 'r.')
ax = plt.axis() #grab axis values
x = np.linspace(ax[0], ax[1]+0.01)
plt.plot(x, model.beta[1] + model.beta[0]*x, 'b', lw=2)
plt.grid(True)
plt.axis('tight')
plt.xlabel('USD-JPY Mid Price')
plt.ylabel('EUR-USD Mid Price')
plt.show()
/home/maximshen/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: FutureWarning: The pandas.stats.ols module is deprecated and will be removed in a future version. We refer to external packages like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html
  exec(code_obj, self.user_global_ns, self.user_ns)
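As the warning suggests, the same regression can be fitted with statsmodels instead of the deprecated pandas.stats.ols; a minimal sketch, assuming the df built above (pd.ols silently drops missing rows, so we do the same with dropna):

import statsmodels.api as sm

clean = df.dropna()
X = sm.add_constant(clean['USD-JPY mid price'])        # adds the intercept column
ols_fit = sm.OLS(clean['EUR-USD mid price'], X).fit()
print(ols_fit.summary())
# slope and intercept, comparable to model.beta[0] and model.beta[1] above
slope = ols_fit.params['USD-JPY mid price']
intercept = ols_fit.params['const']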
In [29]:
len(df.index) # how many five minute intervals are included
Out[29]:
8316
In [30]:
model
Out[30]:
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         5870
Number of Degrees of Freedom:   2

R-squared:         0.0977
Adj R-squared:     0.0975

Rmse:              0.0051

F-stat (1, 5868):   635.3313, p-value:     0.0000

Degrees of Freedom: model 1, resid 5868

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x    -0.0013     0.0001     -25.21     0.0000    -0.0014    -0.0012
     intercept     1.2741     0.0056     229.12     0.0000     1.2632     1.2850
---------------------------------End of Summary---------------------------------
In [31]:
# From the chart above,
# it looks like the correlation is relatively higher
# when USD-JPY is between 108 and 110, so let's zoom in a little

# apply two filters for 'USD-JPY mid price'
df = df[(df['USD-JPY mid price'] >=108) & (df['USD-JPY mid price'] <=110)]

model = pd.ols(y=df['EUR-USD mid price'], x=df['USD-JPY mid price'])
plt.plot(df['USD-JPY mid price'], df['EUR-USD mid price'], 'r.')
ax = plt.axis() #grab axis values
x = np.linspace(ax[0], ax[1]+0.01)
plt.plot(x, model.beta[1] + model.beta[0]*x, 'b', lw=2)
plt.grid(True)
plt.axis('tight')
plt.xlabel('USD-JPY Mid Price')
plt.ylabel('EUR-USD Mid Price')
plt.show()
/home/maximshen/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2885: FutureWarning: The pandas.stats.ols module is deprecated and will be removed in a future version. We refer to external packages like statsmodels, see some examples here: http://statsmodels.sourceforge.net/stable/regression.html
  exec(code_obj, self.user_global_ns, self.user_ns)
In [32]:
len(df.index) # how many five minute intervals are included
Out[32]:
3284
In [33]:
model
Out[33]:
-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <x> + <intercept>

Number of Observations:         3284
Number of Degrees of Freedom:   2

R-squared:         0.2286
Adj R-squared:     0.2284

Rmse:              0.0043

F-stat (1, 3282):   972.6776, p-value:     0.0000

Degrees of Freedom: model 1, resid 3282

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
             x    -0.0045     0.0001     -31.19     0.0000    -0.0048    -0.0042
     intercept     1.6230     0.0157     103.45     0.0000     1.5923     1.6538
---------------------------------End of Summary---------------------------------

