%matplotlib inline
%pylab inline
Populating the interactive namespace from numpy and matplotlib
from pandas import Series, DataFrame
import numpy as np
import pandas as pd
from numpy import random

pandas data structures

Series

obj = Series(np.arange(1,11))
obj
0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
dtype: int64
obj.values
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
obj.index
RangeIndex(start=0, stop=10, step=1)
obj[1]
2
obj[obj%2==0]
1     2
3     4
5     6
7     8
9    10
dtype: int64
sdata = {'Ohio':35000, 'Texas': 71000, 'Oregon':16000, 'Utah': 5000}
sdata
{'Ohio': 35000, 'Oregon': 16000, 'Texas': 71000, 'Utah': 5000}
obj3 = Series(sdata)
obj3
Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64
states = ['California','Ohio', 'Oregon','Texas']
obj4 = Series(sdata, index = states)
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
pd.isnull(obj4)
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
pd.notnull(obj4)
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
obj4.isnull()
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
obj4 + obj3
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
obj.index = np.arange(1,11)
obj
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
dtype: int64

DataFrame

Similar to R's data.frame.

data = {'state':['Ohio', 'Ohio','Ohio','Nevada','Nevada'],
        'year': [2000,2001,2002, 2001, 2002],
        'pop':[1.5, 1.7, 3.6,2.4,2.9]
        }
frame = DataFrame(data)
frame
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002

DataFrame(data, columns=['year', 'state', 'pop'])
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9

frame2 = DataFrame(data, columns=['year', 'state', 'pop','debt'],
                   index = ['one','two','three','four','five'])
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN

frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
frame2['state']
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
frame2['year']
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
frame2.year
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
frame2.ix['three']
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
frame2['debt'] = 16.5
frame2
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5

frame2['debt'] = np.arange(5.)
frame2
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0

val = Series([-1.2, -1.5, -1.7], index = ['two', 'four', 'five'])
frame2['debt'] = val
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7

Index objects

obj = Series(range(3), index=['a','b','c'])
index = obj.index
index
Index(['a', 'b', 'c'], dtype='object')

Index objects are immutable.

index = pd.Index(np.arange(3))
obj2 = Series([1.5, -2.5,0],index=index)
obj2.index is index
True
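
A quick check (a minimal sketch) that an Index really is immutable: item assignment raises a TypeError.

try:
    index[1] = 99          # Index does not support item assignment
except TypeError as e:
    print("immutable:", e)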

Essential functionality

obj = Series([4.5, 7.2, -5.3, 3.6], index=['b','d', 'a','c'])
obj
b    4.5
d    7.2
a   -5.3
c    3.6
dtype: float64
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
a   -5.3
b    4.5
c    3.6
d    7.2
e    NaN
dtype: float64
obj3 = Series(['blue', 'purple','yellow'], index = [0, 3, 4])
obj3
0      blue
3    purple
4    yellow
dtype: object
obj3.reindex(np.arange(6), method='bfill')
0      blue
1    purple
2    purple
3    purple
4    yellow
5       NaN
dtype: object
frame = DataFrame(np.arange(9).reshape(3,3), index = ['a', 'c','b'],
                  columns=['Ohio', 'Texas','California'])
frame
Ohio Texas California
a 0 1 2
c 3 4 5
b 6 7 8

frame2 = frame.reindex(['a','b', 'c','d'])
frame2
Ohio Texas California
a 0.0 1.0 2.0
b 6.0 7.0 8.0
c 3.0 4.0 5.0
d NaN NaN NaN

states = ['Texas', 'Utah','California']
frame3 = frame.reindex(columns=states)
frame3
Texas Utah California
a 1 NaN 2
c 4 NaN 5
b 7 NaN 8

frame3.index
Index(['a', 'c', 'b'], dtype='object')
frame5 = frame.ix[['a', 'b','c','d'], states]
new_obj = frame5.drop('d')
new_obj
Texas Utah California
a 1.0 NaN 2.0
b 7.0 NaN 8.0
c 4.0 NaN 5.0

frame5.drop('Utah', axis=1)
Texas California
a 1.0 2.0
b 7.0 8.0
c 4.0 5.0
d NaN NaN

Indexing, selection, and filtering

Series indexing works much like NumPy array indexing.

obj = Series(np.arange(4.), index = ['a','b','c','d'])
obj
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
obj[2:4]
c    2.0
d    3.0
dtype: float64
obj['a']
0.0
obj[[3,2]]
d    3.0
c    2.0
dtype: float64
obj[['a','d','c']]
a    0.0
d    3.0
c    2.0
dtype: float64
obj[obj%2==0]
a    0.0
c    2.0
dtype: float64

Unlike regular Python slicing, slicing with labels includes the endpoint.

obj["a":'c']
a    0.0
b    1.0
c    2.0
dtype: float64

DataFrame indexing

frame5
Texas Utah California
a 1.0 NaN 2.0
b 7.0 NaN 8.0
c 4.0 NaN 5.0
d NaN NaN NaN

frame5['Texas']
a    1.0
b    7.0
c    4.0
d    NaN
Name: Texas, dtype: float64
frame[frame%2 == 0]
Ohio Texas California
a 0.0 NaN 2.0
c NaN 4.0 NaN
b 6.0 NaN 8.0

frame5.ix[2]
Texas         4.0
Utah          NaN
California    5.0
Name: c, dtype: float64
frame5.ix[0]
Texas         1.0
Utah          NaN
California    2.0
Name: a, dtype: float64
frame5.ix[['a','c'],'Texas']
a    1.0
c    4.0
Name: Texas, dtype: float64
frame5.ix[['a','d'],['Texas', 'Ohio']]
Texas Ohio
a 1.0 NaN
d NaN NaN

frame5.Texas
a    1.0
b    7.0
c    4.0
d    NaN
Name: Texas, dtype: float64
frame5.ix[frame5.Texas%2==1,0]
a    1.0
b    7.0
Name: Texas, dtype: float64

DataFrame indexing options (modern .loc/.iloc equivalents are sketched after the list):

  • obj[val] selects a single column or a sequence of columns
  • obj.ix[val] selects rows
  • obj.ix[:, val] selects columns
  • obj.ix[val1, val2] selects rows and columns at once
  • reindex conforms one or more axes to a new index

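Note that .ix is deprecated in current pandas; a minimal sketch of the .loc (label-based) and .iloc (position-based) equivalents, using the frame5 built above:

frame5.loc[['a', 'c'], 'Texas']     # rows by label, one column (like obj.ix[val1, val2])
frame5.loc[:, ['Texas', 'Utah']]    # all rows, columns by label (like obj.ix[:, val])
frame5.iloc[2]                      # third row by position (like frame5.ix[2])
frame5.iloc[:, 0]                   # first column by position
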
Arithmetic and data alignment

s1 = Series(np.arange(4.0), index = ['a', 'b','c','d'])
s2 = Series(np.arange(5.0), index = ['a','c','e','g','i'])
s1
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
s2
a    0.0
c    1.0
e    2.0
g    3.0
i    4.0
dtype: float64
s1 + s2
a    0.0
b    NaN
c    3.0
d    NaN
e    NaN
g    NaN
i    NaN
dtype: float64

For DataFrames, alignment happens on both the rows and the columns.

df1 = DataFrame(np.arange(9.).reshape(3, 3), columns=list('bcd'), index=["A", "B", "C"])
df2 = DataFrame(randn(16).reshape(4,4), columns=list('bdce'), index=["A", "D", "C", "E"])
df1
b c d
A 0.0 1.0 2.0
B 3.0 4.0 5.0
C 6.0 7.0 8.0

df2
b d c e
A 0.303728 -0.716209 -1.532537 -1.234941
D -0.400836 -1.983266 0.017150 -0.678466
C -1.801502 0.125058 1.214294 -0.049006
E 0.764905 -1.760736 -0.494800 0.389546

df1 + df2
b c d e
A 0.303728 -0.532537 1.283791 NaN
B NaN NaN NaN NaN
C 4.198498 8.214294 8.125058 NaN
D NaN NaN NaN NaN
E NaN NaN NaN NaN

Arithmetic methods with fill values

df1.add(df2, fill_value= 0)
b c d e
A 0.303728 -0.532537 1.283791 -1.234941
B 3.000000 4.000000 5.000000 NaN
C 4.198498 8.214294 8.125058 -0.049006
D -0.400836 0.017150 -1.983266 -0.678466
E 0.764905 -0.494800 -1.760736 0.389546

df2.add(df1, fill_value= 0)
b c d e
A 0.303728 -0.532537 1.283791 -1.234941
B 3.000000 4.000000 5.000000 NaN
C 4.198498 8.214294 8.125058 -0.049006
D -0.400836 0.017150 -1.983266 -0.678466
E 0.764905 -0.494800 -1.760736 0.389546

Operations between a DataFrame and a Series

arr = np.arange(12).reshape(3, 4)
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
arr[[1,2]]
array([[ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
df1
b c d
A 0.0 1.0 2.0
B 3.0 4.0 5.0
C 6.0 7.0 8.0

df2
b d c e
A 0.303728 -0.716209 -1.532537 -1.234941
D -0.400836 -1.983266 0.017150 -0.678466
C -1.801502 0.125058 1.214294 -0.049006
E 0.764905 -1.760736 -0.494800 0.389546

# operating along the rows: the Series is matched on the columns and subtracted from every row
s1 = df1.ix[0]
df1 - s1
b c d
A 0.0 0.0 0.0
B 3.0 3.0 3.0
C 6.0 6.0 6.0

s2 = df1.b
s2
A    0.0
B    3.0
C    6.0
Name: b, dtype: float64
df1.sub(s2,axis = 0)
b c d
A 0.0 1.0 2.0
B 0.0 1.0 2.0
C 0.0 1.0 2.0

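The broadcasting direction can also be spelled out with axis='index' / axis='columns'; a minimal sketch using df1 and s2 from above:

df1.sub(s2, axis='index')               # match s2 on the row labels, same as axis=0 above
df1.sub(df1.loc['A'], axis='columns')   # match on the column labels (the default for the operators)
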
Function application and mapping

frame = DataFrame(random.randn(4, 3), columns = ['b', 'd','e'],
                  index=['Utah','Ohio', 'Texas','California'])
frame
b d e
Utah -0.390809 -0.628092 0.973094
Ohio 1.382407 0.610332 -1.573332
Texas 0.407925 0.607034 1.702876
California 0.308539 -0.437947 0.215211

f = lambda x: x.max() - x.min()
frame.apply(f)
b    1.773216
d    1.238424
e    3.276208
dtype: float64

The equivalent in R:

f = function(x) max(x) - min(x)
apply(frame, 2, f)

frame.apply(f, axis=1)
Utah          1.601187
Ohio          2.955739
Texas         1.294951
California    0.746486
dtype: float64

The equivalent in R:

apply(frame, 1, f)

def f(x):
    return Series([x.min(), x.max()], index = ['min', 'max'])
frame.apply(f)
b d e
min -0.390809 -0.628092 -1.573332
max 1.382407 0.610332 1.702876

The equivalent in R:

f = function(x) c( max = max(x), min = min(x) )
apply(frame, 1, f)
apply(frame, 2, f)

applymap applies a function element-wise.

format = lambda x: '%.2f' %x
frame.applymap(format)
b d e
Utah -0.39 -0.63 0.97
Ohio 1.38 0.61 -1.57
Texas 0.41 0.61 1.70
California 0.31 -0.44 0.22

The corresponding R code:

format <- function(x) round(x, digits = 2)
apply(frame ,c(1, 2), format)

Sorting and ranking

s1 = Series(randn(5), index = list('acdbe'))
s1
a   -1.400520
c   -0.396774
d   -1.836984
b   -0.202243
e    0.657407
dtype: float64
s1.sort_index()
a   -1.400520
b   -0.202243
c   -0.396774
d   -1.836984
e    0.657407
dtype: float64
s1.sort_values()
d   -1.836984
a   -1.400520
c   -0.396774
b   -0.202243
e    0.657407
dtype: float64
s1.order()
/Users/shihchosen/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: order is deprecated, use sort_values(...)
  if __name__ == '__main__':
d   -1.836984
a   -1.400520
c   -0.396774
b   -0.202243
e    0.657407
dtype: float64
frame.sort_index()
b d e
California 0.308539 -0.437947 0.215211
Ohio 1.382407 0.610332 -1.573332
Texas 0.407925 0.607034 1.702876
Utah -0.390809 -0.628092 0.973094

frame.sort_values(by = "b", ascending=False)
b d e
Ohio 1.382407 0.610332 -1.573332
Texas 0.407925 0.607034 1.702876
California 0.308539 -0.437947 0.215211
Utah -0.390809 -0.628092 0.973094

Rank

s1
a   -1.400520
c   -0.396774
d   -1.836984
b   -0.202243
e    0.657407
dtype: float64
s1.rank()
a    2.0
c    3.0
d    1.0
b    4.0
e    5.0
dtype: float64

When values are tied, each tied value gets the average of the ranks it spans by default; the method parameter controls how ties are broken.

s2 = Series([2, 4, 3, 1,4])
s2.rank()# (4+5)/2 = 4.5
0    2.0
1    4.5
2    3.0
3    1.0
4    4.5
dtype: float64
s2.rank(method="first")
0    2.0
1    4.0
2    3.0
3    1.0
4    5.0
dtype: float64

The method options for breaking ties (compared in the sketch after the list):

  1. average (default): assign the average rank to each entry in the tied group
  2. min: use the minimum rank of the whole group
  3. max: use the maximum rank of the whole group
  4. first: assign ranks in the order the values appear in the data

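A minimal sketch comparing the tie-breaking options on s2 from above:

for method in ['average', 'min', 'max', 'first']:
    print(method, s2.rank(method=method).tolist())
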
Summarizing and computing descriptive statistics

frame.sum()
b    1.708061
d    0.151327
e    1.317850
dtype: float64
frame.sum(axis=1)
Utah         -0.045807
Ohio          0.419407
Texas         2.717835
California    0.085803
dtype: float64
frame.cumsum()
b d e
Utah -0.390809 -0.628092 0.973094
Ohio 0.991598 -0.017760 -0.600237
Texas 1.399522 0.589274 1.102639
California 1.708061 0.151327 1.317850

frame.cumsum(axis=1)
b d e
Utah -0.390809 -1.018901 -0.045807
Ohio 1.382407 1.992739 0.419407
Texas 0.407925 1.014959 2.717835
California 0.308539 -0.129408 0.085803

frame.describe()
b d e
count 4.000000 4.000000 4.000000
mean 0.427015 0.037832 0.329462
std 0.729386 0.663719 1.406438
min -0.390809 -0.628092 -1.573332
25% 0.133702 -0.485483 -0.231925
50% 0.358232 0.084544 0.594153
75% 0.651545 0.607859 1.155540
max 1.382407 0.610332 1.702876

Correlation and covariance

import pandas_datareader.data as web
all_data = {}
for ticker in ['BABA', 'BIDU', "JD"]:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2016', '9/1/2016')
volume = DataFrame({tic: data["Volume"] for tic, data in all_data.items()})
volume.head()
BABA BIDU JD
Date
2016-01-04 23066300 4328000 18265300
2016-01-05 14258900 3367900 9426400
2016-01-06 11569300 2280600 12988900
2016-01-07 27288100 7009100 18155700
2016-01-08 20814600 4682800 15164100

price = DataFrame({tic: data["Adj Close"] for tic, data in all_data.items()})
price.head()
BABA BIDU JD
Date
2016-01-04 76.690002 184.029999 29.530001
2016-01-05 78.629997 187.369995 30.000000
2016-01-06 77.330002 185.610001 29.760000
2016-01-07 72.720001 174.369995 27.950001
2016-01-08 70.800003 171.339996 27.620001

returns = price.pct_change()
returns.tail()
BABA BIDU JD
Date
2016-08-26 0.011922 0.003587 0.014556
2016-08-29 -0.001894 0.001038 -0.006979
2016-08-30 0.011804 0.010366 -0.000781
2016-08-31 0.012396 -0.024964 -0.007034
2016-09-01 0.002366 0.032034 0.011806

returns.BABA.corr(returns.JD)
0.63037358921035291
returns.BABA.corr(returns.BIDU)
0.49956986052264424
returns.corr()
BABA BIDU JD
BABA 1.000000 0.499570 0.630374
BIDU 0.499570 1.000000 0.586531
JD 0.630374 0.586531 1.000000

returns.corrwith(returns.BABA)
BABA    1.000000
BIDU    0.499570
JD      0.630374
dtype: float64
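
corr has a covariance counterpart; a minimal sketch on the returns frame above:

returns.BABA.cov(returns.JD)       # covariance between two return series
returns.cov()                      # full covariance matrix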

Unique values, value counts, and membership

obj = Series(['a', 'c','c', 'a', 'b', 'd', 'e','A'])
obj.unique()
array(['a', 'c', 'b', 'd', 'e', 'A'], dtype=object)
obj.value_counts()
a    2
c    2
b    1
A    1
e    1
d    1
dtype: int64
obj.isin(['b', 'f'])
0    False
1    False
2    False
3    False
4     True
5    False
6    False
7    False
dtype: bool

Handling missing data

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
The main NA-handling methods:
  1. dropna: drop axis labels with missing data
  2. fillna: fill in missing data
  3. isnull: return booleans indicating which values are missing
  4. notnull: the negation of isnull
from numpy import nan as NA
data = Series([1, NA, 3.5, NA, 7])
data
0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64

Calling dropna on a DataFrame drops every row that contains a NaN by default.

df3 = df1 + df2
df3
b c d e
A 0.303728 -0.532537 1.283791 NaN
B NaN NaN NaN NaN
C 4.198498 8.214294 8.125058 NaN
D NaN NaN NaN NaN
E NaN NaN NaN NaN

df3.dropna()
Empty DataFrame
Columns: [b, c, d, e]
Index: []

With how='all', only rows (or columns) that are entirely NaN are dropped.

df3.dropna(how = "all")
b c d e
A 0.303728 -0.532537 1.283791 NaN
C 4.198498 8.214294 8.125058 NaN

df3.dropna(how = 'all', axis = 1)
b c d
A 0.303728 -0.532537 1.283791
B NaN NaN NaN
C 4.198498 8.214294 8.125058
D NaN NaN NaN
E NaN NaN NaN

The thresh parameter (keep only rows with at least thresh non-NA values)

df3.dropna(how = "all", thresh=3)
b c d e
A 0.303728 -0.532537 1.283791 NaN
C 4.198498 8.214294 8.125058 NaN

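A minimal sketch of how thresh behaves on df3 from above (before it is filled below); thresh=N keeps the rows that have at least N non-NA values:

df3.dropna(thresh=1)               # keep rows with at least one non-NA value
df3.dropna(thresh=4)               # require every column to be non-NA in the row
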
Filling in missing values

df3.fillna(0)
b c d e
A 0.303728 -0.532537 1.283791 0.0
B 0.000000 0.000000 0.000000 0.0
C 4.198498 8.214294 8.125058 0.0
D 0.000000 0.000000 0.000000 0.0
E 0.000000 0.000000 0.000000 0.0

df3.fillna(0, inplace=True)
df3
b c d e
A 0.303728 -0.532537 1.283791 0.0
B 0.000000 0.000000 0.000000 0.0
C 4.198498 8.214294 8.125058 0.0
D 0.000000 0.000000 0.000000 0.0
E 0.000000 0.000000 0.000000 0.0

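fillna also accepts a per-column dict and forward filling; a minimal sketch on a fresh frame (df3 above has already been filled in place):

df4 = df1 + df2                    # rebuild a frame that still contains NaN
df4.fillna({'b': 0, 'e': -1})      # a different fill value for each column
df4.fillna(method='ffill')         # propagate the last valid value downwards
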
Hierarchical indexing

data = Series(np.random.randn(10),
              index = [['a', 'a', 'a', 'b', 'b', 'b','c','c','c','d' ],
                       [1, 2, 3,1, 2, 3,1, 2, 3,1]])
data
a  1    0.838350
   2    0.191902
   3   -0.617628
b  1   -0.738438
   2   -0.230139
   3    1.577543
c  1   -0.313966
   2   -0.682991
   3    0.671963
d  1    1.315139
dtype: float64
data.index
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]])
data['a']
1    0.838350
2    0.191902
3   -0.617628
dtype: float64
data[['a','d']]
a  1    0.838350
   2    0.191902
   3   -0.617628
d  1    1.315139
dtype: float64
data['a':'c']
a  1    0.838350
   2    0.191902
   3   -0.617628
b  1   -0.738438
   2   -0.230139
   3    1.577543
c  1   -0.313966
   2   -0.682991
   3    0.671963
dtype: float64
data[:,1]
a    0.838350
b   -0.738438
c   -0.313966
d    1.315139
dtype: float64
data1 = data.unstack()
data1
1 2 3
a 0.838350 0.191902 -0.617628
b -0.738438 -0.230139 1.577543
c -0.313966 -0.682991 0.671963
d 1.315139 NaN NaN

data1.stack()
a  1    0.838350
   2    0.191902
   3   -0.617628
b  1   -0.738438
   2   -0.230139
   3    1.577543
c  1   -0.313966
   2   -0.682991
   3    0.671963
d  1    1.315139
dtype: float64

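Level-wise selection and aggregation on the MultiIndexed data above; a minimal sketch:

data.groupby(level=0).sum()        # aggregate within each outer-level key
data.swaplevel(0, 1)               # exchange the two index levels
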
Data loading, storage, and file formats

Reading data in text format

zip_codes = pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv")
zip_codes
zip_code latitude longitude city state county
0 501 40.922326 -72.637078 Holtsville NY Suffolk
1 544 40.922326 -72.637078 Holtsville NY Suffolk
2 601 18.165273 -66.722583 Adjuntas PR Adjuntas
3 602 18.393103 -67.180953 Aguada PR Aguada
4 603 18.455913 -67.145780 Aguadilla PR Aguadilla
5 604 18.493520 -67.135883 Aguadilla PR Aguadilla
6 605 18.465162 -67.141486 Aguadilla PR Aguadilla
7 606 18.172947 -66.944111 Maricao PR Maricao
8 610 18.288685 -67.139696 Anasco PR Anasco
9 611 18.279531 -66.802170 Angeles PR Utuado
10 612 18.450674 -66.698262 Arecibo PR Arecibo
11 613 18.458093 -66.732732 Arecibo PR Arecibo
12 614 18.429675 -66.674506 Arecibo PR Arecibo
13 616 18.444792 -66.640678 Bajadero PR Arecibo
14 617 18.447092 -66.544255 Barceloneta PR Barceloneta
15 622 17.998531 -67.187318 Boqueron PR Cabo Rojo
16 623 18.062201 -67.149541 Cabo Rojo PR Cabo Rojo
17 624 18.023535 -66.726156 Penuelas PR Penuelas
18 627 18.477891 -66.854770 Camuy PR Camuy
19 631 18.269187 -66.864993 Castaner PR Lares
20 636 18.113284 -67.039706 Rosario PR San German
21 637 18.087322 -66.934911 Sabana Grande PR Sabana Grande
22 638 18.336160 -66.472087 Ciales PR Ciales
23 641 18.250027 -66.698957 Utuado PR Utuado
24 646 18.436060 -66.281954 Dorado PR Dorado
25 647 17.969594 -66.939754 Ensenada PR Guanica
26 650 18.360125 -66.562311 Florida PR Florida
27 652 18.457254 -66.603358 Garrochales PR Arecibo
28 653 17.972468 -66.898661 Guanica PR Guanica
29 656 18.023280 -66.786909 Guayanilla PR Guayanilla
... ... ... ... ... ... ...
42711 99803 58.377074 -134.617383 Juneau AK Juneau
42712 99811 58.383480 -134.197786 Juneau AK Juneau
42713 99820 57.500525 -134.585954 Angoon AK Skagway Hoonah Angoon
42714 99821 58.449413 -134.700348 Auke Bay AK Juneau
42715 99824 58.276308 -134.397642 Douglas AK Juneau
42716 99825 58.331798 -135.181256 Elfin Cove AK Skagway Hoonah Angoon
42717 99826 58.331798 -135.181256 Gustavus AK Skagway Hoonah Angoon
42718 99827 59.312688 -135.681153 Haines AK Haines
42719 99829 58.097655 -135.431561 Hoonah AK Skagway Hoonah Angoon
42720 99830 56.973554 -133.935996 Kake AK Wrangell Petersburg
42721 99832 58.331798 -135.181256 Pelican AK Skagway Hoonah Angoon
42722 99833 56.767056 -132.931950 Petersburg AK Wrangell Petersburg
42723 99835 57.080818 -135.318348 Sitka AK Sitka
42724 99836 56.535950 -132.827948 Port Alexander AK Wrangell Petersburg
42725 99840 59.456753 -135.314852 Skagway AK Skagway Hoonah Angoon
42726 99841 57.798249 -135.285061 Tenakee Springs AK Skagway Hoonah Angoon
42727 99850 58.383480 -134.197786 Juneau AK Juneau
42728 99901 55.400674 -131.674090 Ketchikan AK Ketchikan Gateway
42729 99903 55.517921 -132.003244 Meyers Chuck AK Prince Wales Ketchikan
42730 99918 55.517921 -132.003244 Coffman Cove AK Prince Wales Ketchikan
42731 99919 55.126774 -131.580407 Thorne Bay AK Prince Wales Ketchikan
42732 99921 55.513301 -133.117022 Craig AK Prince Wales Ketchikan
42733 99922 55.208706 -132.825903 Hydaburg AK Prince Wales Ketchikan
42734 99923 55.517921 -132.003244 Hyder AK Prince Wales Ketchikan
42735 99925 55.554393 -133.090680 Klawock AK Prince Wales Ketchikan
42736 99926 55.094325 -131.566827 Metlakatla AK Prince Wales Ketchikan
42737 99927 55.517921 -132.003244 Point Baker AK Prince Wales Ketchikan
42738 99928 55.395359 -131.675370 Ward Cove AK Ketchikan Gateway
42739 99929 56.449893 -132.364407 Wrangell AK Wrangell Petersburg
42740 99950 55.542007 -131.432682 Ketchikan AK Ketchikan Gateway

42741 rows × 6 columns


names = ['zip_code', 'latitude', 'longitude', 'city', 'state', 'county']
pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv",
            sep=",", names= names, index_col = ['zip_code','city'])
latitude longitude state county
zip_code city
zip_code city latitude longitude state county
00501 Holtsville 40.922326 -72.637078 NY Suffolk
00544 Holtsville 40.922326 -72.637078 NY Suffolk
00601 Adjuntas 18.165273 -66.722583 PR Adjuntas
00602 Aguada 18.393103 -67.180953 PR Aguada
00603 Aguadilla 18.455913 -67.14578 PR Aguadilla
00604 Aguadilla 18.49352 -67.135883 PR Aguadilla
00605 Aguadilla 18.465162 -67.141486 PR Aguadilla
00606 Maricao 18.172947 -66.944111 PR Maricao
00610 Anasco 18.288685 -67.139696 PR Anasco
00611 Angeles 18.279531 -66.80217 PR Utuado
00612 Arecibo 18.450674 -66.698262 PR Arecibo
00613 Arecibo 18.458093 -66.732732 PR Arecibo
00614 Arecibo 18.429675 -66.674506 PR Arecibo
00616 Bajadero 18.444792 -66.640678 PR Arecibo
00617 Barceloneta 18.447092 -66.544255 PR Barceloneta
00622 Boqueron 17.998531 -67.187318 PR Cabo Rojo
00623 Cabo Rojo 18.062201 -67.149541 PR Cabo Rojo
00624 Penuelas 18.023535 -66.726156 PR Penuelas
00627 Camuy 18.477891 -66.85477 PR Camuy
00631 Castaner 18.269187 -66.864993 PR Lares
00636 Rosario 18.113284 -67.039706 PR San German
00637 Sabana Grande 18.087322 -66.934911 PR Sabana Grande
00638 Ciales 18.33616 -66.472087 PR Ciales
00641 Utuado 18.250027 -66.698957 PR Utuado
00646 Dorado 18.43606 -66.281954 PR Dorado
00647 Ensenada 17.969594 -66.939754 PR Guanica
00650 Florida 18.360125 -66.562311 PR Florida
00652 Garrochales 18.457254 -66.603358 PR Arecibo
00653 Guanica 17.972468 -66.898661 PR Guanica
99803 Juneau 58.377074 -134.617383 AK Juneau
99811 Juneau 58.38348 -134.197786 AK Juneau
99820 Angoon 57.500525 -134.585954 AK Skagway Hoonah Angoon
99821 Auke Bay 58.449413 -134.700348 AK Juneau
99824 Douglas 58.276308 -134.397642 AK Juneau
99825 Elfin Cove 58.331798 -135.181256 AK Skagway Hoonah Angoon
99826 Gustavus 58.331798 -135.181256 AK Skagway Hoonah Angoon
99827 Haines 59.312688 -135.681153 AK Haines
99829 Hoonah 58.097655 -135.431561 AK Skagway Hoonah Angoon
99830 Kake 56.973554 -133.935996 AK Wrangell Petersburg
99832 Pelican 58.331798 -135.181256 AK Skagway Hoonah Angoon
99833 Petersburg 56.767056 -132.931950 AK Wrangell Petersburg
99835 Sitka 57.080818 -135.318348 AK Sitka
99836 Port Alexander 56.53595 -132.827948 AK Wrangell Petersburg
99840 Skagway 59.456753 -135.314852 AK Skagway Hoonah Angoon
99841 Tenakee Springs 57.798249 -135.285061 AK Skagway Hoonah Angoon
99850 Juneau 58.38348 -134.197786 AK Juneau
99901 Ketchikan 55.400674 -131.674090 AK Ketchikan Gateway
99903 Meyers Chuck 55.517921 -132.003244 AK Prince Wales Ketchikan
99918 Coffman Cove 55.517921 -132.003244 AK Prince Wales Ketchikan
99919 Thorne Bay 55.126774 -131.580407 AK Prince Wales Ketchikan
99921 Craig 55.513301 -133.117022 AK Prince Wales Ketchikan
99922 Hydaburg 55.208706 -132.825903 AK Prince Wales Ketchikan
99923 Hyder 55.517921 -132.003244 AK Prince Wales Ketchikan
99925 Klawock 55.554393 -133.090680 AK Prince Wales Ketchikan
99926 Metlakatla 55.094325 -131.566827 AK Prince Wales Ketchikan
99927 Point Baker 55.517921 -132.003244 AK Prince Wales Ketchikan
99928 Ward Cove 55.395359 -131.675370 AK Ketchikan Gateway
99929 Wrangell 56.449893 -132.364407 AK Wrangell Petersburg
99950 Ketchikan 55.542007 -131.432682 AK Ketchikan Gateway

42742 rows × 4 columns


Reading text files in pieces

pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv",
            nrows = 5)
zip_code latitude longitude city state county
0 501 40.922326 -72.637078 Holtsville NY Suffolk
1 544 40.922326 -72.637078 Holtsville NY Suffolk
2 601 18.165273 -66.722583 Adjuntas PR Adjuntas
3 602 18.393103 -67.180953 Aguada PR Aguada
4 603 18.455913 -67.145780 Aguadilla PR Aguadilla

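Besides nrows, read_csv can return an iterator of chunks via chunksize; a minimal sketch that tallies the state column piece by piece (same file as above):

chunks = pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv",
                     chunksize=10000)
state_counts = Series([], dtype=float)
for piece in chunks:
    state_counts = state_counts.add(piece['state'].value_counts(), fill_value=0)
state_counts.sort_values(ascending=False).head()
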
Writing data out to text format

zip_codes.to_csv("/Users/shihchosen/Downloads/zip_codes.csv")
data = pd.read_csv('ch06/ex5.csv')
data
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

import sys
data.to_csv(sys.stdout, na_rep="NULL")
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

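to_csv has a few more commonly used switches; a minimal sketch, still writing to sys.stdout:

data.to_csv(sys.stdout, index=False)              # drop the row index
data.to_csv(sys.stdout, columns=['a', 'b', 'c'])  # write only a subset of the columns
data.to_csv(sys.stdout, sep='|')                  # use an alternative delimiter
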
JSON data

import json
f = open('ch06/nations.json')
result = json.load(f)
result
[{'income': [[1800, 359.93],
   [1820, 359.93],
   [1913, 556.12],
   [1950, 3363.02],
   [1951, 3440.9],
   [1952, 3520.61],
   [1953, 3598.81],
   [1954, 3450.82],
   [1955, 3672.08],
   [1956, 3549.04],
   [1957, 3827.94],
   [1958, 3966.42],
   [1959, 3917.76],
   [1960, 4006.21],
   [1961, 4463.83],
   [1962, 4269.28],
   [1963, 4413.6],
   [1964, 4826.49],
   [1965, 5102.21],
   [1966, 5308.14],
   [1967, 5522.78],
   [1968, 5346.63],
   [1969, 5408.12],
   [1970, 5651.88],
   [1971, 5526.21],
   [1972, 5473.29],
   [1973, 5722.02],
   [1974, 5470.21],
   [1975, 3430.85],
   [1976, 3050.32],
   [1977, 3008.65],
   [1978, 3070.82],
   [1979, 3064.89],
   [1980, 3074.75],
   [1981, 2953.41],
   [1982, 2756.95],
   [1983, 2584.56],
   [1984, 2527.47],
   [1985, 2492.83],
   [1986, 2220.61],
   [1987, 2430.21],
   [1988, 2728.53],
   [1989, 2730.56],
   [1990, 2777.42],
   [1991, 2730.85],
   [1992, 2627.85],
   [1993, 1869.92],
   [1994, 1851.45],
   [1995, 1989.02],
   [1996, 2157.35],
   [1997, 2277.14],
   [1998, 2384.48],
   [1999, 2417.18],
   [2000, 2446.65],
   [2001, 2479.69],
   [2002, 2773.29],
   [2003, 2785.39],
   [2004, 3007.11],
   [2005, 3533],
   [2006, 4069.56],
   [2007, 4755.46],
   [2008, 5228.74],
   [2009, 5055.59]],
  'lifeExpectancy': [[1800, 26.98],
   [1940, 26.98],
   [1950, 29.22],
   [1951, 29.42],
   [1952, 29.81],
   [1953, 30.21],
   [1954, 30.6],
   [1955, 31],
   [1956, 31.4],
   [1957, 31.8],
   [1958, 32.2],
   [1959, 32.6],
   [1960, 33],
   [1961, 33.4],
   [1962, 33.8],
   [1963, 34.2],
   [1964, 34.6],
   [1965, 35],
   [1966, 35.4],
   [1967, 35.8],
   [1968, 36.2],
   [1969, 36.6],
   [1970, 37],
   [1971, 37.41],
   [1972, 37.83],
   [1973, 38.26],
   [1974, 38.68],
   [1975, 39.09],
   [1976, 39.46],
   [1977, 39.8],
   [1978, 40.1],
   [1979, 40.34],
   [1980, 40.55],
   [1981, 40.71],
   [1982, 40.85],
   [1983, 40.97],
   [1984, 41.08],
   [1985, 41.2],
   [1986, 41.33],
   [1987, 41.48],
   [1988, 41.64],
   [1989, 41.81],
   [1990, 41.99],
   [1991, 42.16],
   [1992, 42.32],
   [1993, 42.46],
   [1994, 42.59],
   [1995, 42.7],
   [1996, 42.82],
   [1997, 42.96],
   [1998, 43.12],
   [1999, 43.32],
   [2000, 43.56],
   [2001, 43.86],
   [2002, 44.22],
   [2003, 44.61],
   [2004, 45.05],
   [2005, 45.52],
   [2006, 46.02],
   [2007, 46.54],
   [2008, 47.06],
   [2009, 47.58]],
  'name': 'Angola',
  'population': [[1800, 1567028],
   [1820, 1567028],
   [1940, 3738000],
   [1950, 4117617],
   [1951, 4173095],
   [1952, 4232095],
   [1953, 4293840],
   [1954, 4357527],
   [1955, 4423223],
   [1956, 4490992],
   [1957, 4561361],
   [1958, 4635885],
   [1959, 4714676],
   [1960, 4797344],
   [1961, 4752489],
   [1962, 4826015],
   [1963, 4919586],
   [1964, 5026044],
   [1965, 5134818],
   [1966, 5201250],
   [1967, 5247469],
   [1968, 5350384],
   [1969, 5471641],
   [1970, 5605626],
   [1971, 5752775],
   [1972, 5894858],
   [1973, 6025841],
   [1974, 5986432],
   [1975, 5884241],
   [1976, 5942225],
   [1977, 6162675],
   [1978, 6285716],
   [1979, 6451227],
   [1980, 6741465],
   [1981, 6877697],
   [1982, 7016384],
   [1983, 7238214],
   [1984, 7439658],
   [1985, 7581504],
   [1986, 7744932],
   [1987, 7874230],
   [1988, 8018724],
   [1989, 8148595],
   [1990, 8290856],
   [1991, 8490763],
   [1992, 8735988],
   [1993, 8961438],
   [1994, 9170032],
   [1995, 9421477],
   [1996, 9660081],
   [1997, 9875024],
   [1998, 10071442],
   [1999, 10263229],
   [2000, 10442812],
   [2001, 10623424],
   [2002, 10866106],
   [2003, 11186202],
   [2004, 11521432],
   [2005, 11827315],
   [2006, 12127071],
   ...
   [1960, 1117376],
   [1961, 1131821],
   [1962, 1146757],
   [1963, 1162199],
   [1964, 1178277],
   [1965, 1195012],
   [1966, 1212426],
   [1967, 1230542],
   [1968, 1249383],
   [1969, 1269000],
   [1970, 1289420],
   [1971, 1310672],
   [1972, 1332786],
   [1973, 1355793],
   [1974, 1379655],
   [1975, 1404403],
   [1976, 1430069],
   [1977, 1456688],
   [1978, 1485467],
   [1979, 1516462],
   [1980, 1549589],
   [1981, 1584814],
   [1982, 1622136],
   [1983, 1661573],
   [1984, 1703154],
   [1985, 1746923],
   [1986, 1792933],
   [1987, 1841240],
   [1988, 1891894],
   [1989, 1937201],
   [1990, 1984449],
   [1991, 2041306],
   [1992, 2119465],
   [1993, 2205240],
   [1994, 2279436],
   [1995, 2341749],
   [1996, 2388865],
   [1997, 2444741],
   [1998, 2515309],
   [1999, 2590516],
   [2000, 2667859],
   [2001, 2747312],
   [2002, 2828858],
   [2003, 2912584],
   [2004, 2998563],
   ...
   [1961, 7367.29],
   [1962, 7133.17],
   [1963, 6852.54],
   [1964, 7452.89],
   [1965, 8022.44],
   [1991, 68639527],
  ...
   [1996, 66.15],
   [1997, 66.44],
   [1998, 66.73],
   [1999, 67.06],
   [2000, 67.42],
   [2001, 67.8],
   [2002, 68.18],
   [2003, 68.56],
   [2004, 68.92],
   [2005, 69.26],
   [2006, 69.58],
   [2007, 69.89],
   [2008, 70.2],
   [2009, 70.5]],
  'name': 'Vanuatu',
  'population': [[1800, 27791],
   [1820, 27791],
   [1950, 52000],
   [1951, 53262],
   [1952, 54554],
   ...
   [1998, 182882],
   [1999, 186282],
   [2000, 189618],
   [2001, 192910],
   [2002, 196178],
   [2003, 199414],
   [2004, 202609],
   [2005, 205754],
   [2006, 208869],
   [2007, 211971],
   [2008, 215053]],
  'region': 'East Asia & Pacific'}]
income = result[1]['income']
asjson = json.dumps(income)
income_df = DataFrame(income, columns=["Year","Income"])
income_df
Year Income
0 1800 553.72
1 1820 553.72
2 1913 855.53
3 1950 1104.47
4 1951 1083.57
5 1952 1062.75
6 1953 1012.84
7 1954 1021.29
8 1955 1000.66
9 1956 980.06
10 1957 959.60
11 1958 974.04
12 1959 988.88
13 1960 996.93
14 1961 1006.39
15 1962 949.50
16 1963 970.34
17 1964 1012.25
18 1965 1039.34
19 1966 1051.67
20 1967 1035.83
21 1968 1048.78
22 1969 1052.80
23 1970 1047.17
24 1971 1024.93
25 1972 1085.80
26 1973 1081.87
27 1974 973.64
28 1975 987.96
29 1976 1002.19
33 1980 1154.23
34 1981 1186.70
35 1982 1277.90
36 1983 1187.14
37 1984 1243.82
38 1985 1297.16
39 1986 1285.08
40 1987 1225.86
41 1988 1222.90
42 1989 1158.34
43 1990 1165.47
44 1991 1181.65
45 1992 1191.21
46 1993 1177.91
47 1994 1168.53
48 1995 1176.67
49 1996 1202.15
50 1997 1232.98
51 1998 1251.88
52 1999 1272.80
53 2000 1307.57
54 2001 1333.30
55 2002 1372.88
56 2003 1386.05
57 2004 1389.13
58 2005 1390.00
59 2006 1402.94
60 2007 1428.15
61 2008 1459.07
62 2009 1457.57

63 rows × 2 columns
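
pandas can also serialize a DataFrame back to JSON; a minimal sketch of a round trip with to_json and read_json (record-oriented here):

json_str = income_df.to_json(orient='records')
pd.read_json(json_str, orient='records').head()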