I/O operations for different data formats – using Python 3.X, Numpy, Pandas

Under construction …




io_operations








In [1]:
# By Maxim Shen June 17th 2016
#
# Python becomes more and more popular these days. Therefore, I try to
# illustrate some topics covering python I/O involving different data 
# formats, with the help of Numpy & Pandas

#########################################
#           From numpy to SQL           #
#########################################
import numpy as np
import pandas as pd
import sqlite3 as sq3
In [2]:
data = np.random.standard_normal((1000000, 5)).round(5)
path = 'databases/'
filename = path + 'numbs'

query1 = 'DROP TABLE IF EXISTS numbers' 
query2 = 'CREATE table numbers(No1 real, No2 real, No3 real, No4 real, No5 real)'
con = sq3.Connection(filename + '.db')
con.execute(query1)
con.execute(query2)
Out[2]:
<sqlite3.Cursor at 0x7f4abf2d6030>
In [3]:
%%time
con.executemany('insert into numbers values(?, ?, ?, ?, ?)', data)
con.commit()
CPU times: user 12.3 s, sys: 67.4 ms, total: 12.4 s
Wall time: 13.1 s
In [4]:
 %%time
query = 'select * from numbers where No1 > 0 and No2 < 0'
res = np.array(con.execute(query).fetchall()).round(3)
CPU times: user 562 ms, sys: 80 ms, total: 642 ms
Wall time: 645 ms
In [5]:
res = res[::100] # every 100th result
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(res[:, 0], res[:, 1], 'bo')
plt.grid(True); plt.xlim(-0.5, 4.5); plt.ylim(-4.5, 0.5)
Out[5]:
(-4.5, 0.5)
In [6]:
#########################################
#           From SQL to pandas          #
#########################################
import pandas.io.sql as pds
%time data = pds.read_sql('select * from numbers', con)
CPU times: user 1.86 s, sys: 124 ms, total: 1.98 s
Wall time: 1.99 s
In [7]:
data.head()
Out[7]:
No1 No2 No3 No4 No5
0 -0.32917 -0.93414 0.95442 0.85528 2.03799
1 -0.79488 -1.89925 -0.30668 -0.13543 -1.31810
2 -0.28278 0.85467 -0.21848 0.93119 1.52507
3 0.37585 0.60111 0.75688 -0.78798 0.48340
4 -1.52212 0.36855 1.46930 -1.15853 0.00134
In [8]:
%time data[(data['No1'] > 0) & (data['No2'] < 0)].head()
CPU times: user 24.8 ms, sys: 12 ms, total: 36.8 ms
Wall time: 36.5 ms
Out[8]:
No1 No2 No3 No4 No5
6 1.01389 -0.93966 -2.07765 1.23825 1.45321
7 0.18734 -0.46182 1.26406 -0.24496 -0.52947
10 1.51683 -1.90560 -0.92492 -0.10474 -1.07620
11 0.40437 -0.23879 0.55211 -1.38020 -1.91780
15 2.22866 -1.09516 0.58734 -0.76878 1.37438
In [9]:
%time
res = data[['No1', 'No2']][((data['No1'] > 0.5) | (data['No1'] < -0.5)) & ((data['No2'] < -1) | (data['No2'] > 1))]
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.58 µs
In [10]:
plt.plot(res.No1, res.No2, 'ro')
plt.grid(True); 
In [11]:
#########################################
#           Data as CSV File            #
#########################################
%time data.to_csv(filename + '.csv')
CPU times: user 14.3 s, sys: 420 ms, total: 14.7 s
Wall time: 14.7 s
In [12]:
pd.read_csv(filename + '.csv')[['No1', 'No2', 'No3', 'No4']].hist(bins=20)
Out[12]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f4abca367b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4abca6f5f8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f4abc1ca6d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f4aa60c47f0>]], dtype=object)
In [13]:
#########################################
#           Data as EXCEL File          #
#########################################
%time data[:100000].to_excel(filename + '.xlsx')
CPU times: user 19.3 s, sys: 132 ms, total: 19.4 s
Wall time: 19.4 s
In [ ]:
%%time 
pd.read_excel(filename + '.xlsx', 'Sheet1').cumsum().plot()
CPU times: user 13.4 s, sys: 31.9 ms, total: 13.5 s
Wall time: 13.5 s
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f4abbeebf98>
In [ ]:
%time pd.read_excel(filename + '.xlsx', 'Sheet1').cumsum().plot()
In [ ]:
ll $path*
In [ ]:
#########################################
#      Fast I/O with PyTables           #
#########################################
import numpy as np
import tables as tb 
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
filename = path + 'tab.h5'
h5 = tb.open_file(filename, 'w')
rows = 2000000
row_des = {
    'Date': tb.StringCol(26, pos=1)
    'No1': tb.IntCol(col=2), 
    
}
In [ ]:
 


Tagged on: ,

Leave a Reply