1
2
%matplotlib inline
%pylab inline
Populating the interactive namespace from numpy and matplotlib
1
2
3
4
from pandas import Series, DataFrame
import numpy as np
import pandas as pd
from numpy import random
pandas
数据结构Series
1
2
obj = Series(np.arange(1 ,11 ))
obj
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
8 9
9 10
dtype: int64
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
RangeIndex(start=0, stop=10, step=1)
2
1 2
3 4
5 6
7 8
9 10
dtype: int64
1
sdata = {'Ohio' :35000 , 'Texas' : 71000 , 'Oregon' :16000 , 'Utah' : 5000 }
{'Ohio': 35000, 'Oregon': 16000, 'Texas': 71000, 'Utah': 5000}
Ohio 35000
Oregon 16000
Texas 71000
Utah 5000
dtype: int64
1
2
3
states = ['California' ,'Ohio' , 'Oregon' ,'Texas' ]
obj4 = Series(sdata, index = states)
obj4
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
dtype: float64
California True
Ohio False
Oregon False
Texas False
dtype: bool
California False
Ohio True
Oregon True
Texas True
dtype: bool
California True
Ohio False
Oregon False
Texas False
dtype: bool
California NaN
Ohio 70000.0
Oregon 32000.0
Texas 142000.0
Utah NaN
dtype: float64
1
obj4.name = 'population'
1
obj4.index.name = 'state'
state
California NaN
Ohio 35000.0
Oregon 16000.0
Texas 71000.0
Name: population, dtype: float64
1
obj.index = np.arange(1 ,11 )
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
dtype: int64
DataFrame
类似R
中的data.frame
1
2
3
4
5
6
data = {'state' :['Ohio' , 'Ohio' ,'Ohio' ,'Nevada' ,'Nevada' ],
'year' : [2000 ,2001 ,2002 , 2001 , 2002 ],
'pop' :[1.5 , 1.7 , 3.6 ,2.4 ,2.9 ]
}
frame = DataFrame(data)
pop state year 0 1.5 Ohio 2000 1 1.7 Ohio 2001 2 3.6 Ohio 2002 3 2.4 Nevada 2001 4 2.9 Nevada 2002
1
DataFrame(data, columns=['year' , 'state' , 'pop' ])
year state pop 0 2000 Ohio 1.5 1 2001 Ohio 1.7 2 2002 Ohio 3.6 3 2001 Nevada 2.4 4 2002 Nevada 2.9
1
2
3
frame2 = DataFrame(data, columns=['year' , 'state' , 'pop' ,'debt' ],
index = ['one' ,'two' ,'three' ,'four' ,'five' ])
frame2
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 NaN three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 NaN five 2002 Nevada 2.9 NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')
0 Ohio
1 Ohio
2 Ohio
3 Nevada
4 Nevada
Name: state, dtype: object
0 2000
1 2001
2 2002
3 2001
4 2002
Name: year, dtype: int64
one 2000
two 2001
three 2002
four 2001
five 2002
Name: year, dtype: int64
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
year state pop debt one 2000 Ohio 1.5 16.5 two 2001 Ohio 1.7 16.5 three 2002 Ohio 3.6 16.5 four 2001 Nevada 2.4 16.5 five 2002 Nevada 2.9 16.5
1
frame2['debt' ] = np.arange(5. )
year state pop debt one 2000 Ohio 1.5 0.0 two 2001 Ohio 1.7 1.0 three 2002 Ohio 3.6 2.0 four 2001 Nevada 2.4 3.0 five 2002 Nevada 2.9 4.0
1
val = Series([-1.2 , -1.5 , -1.7 ], index = ['two' , 'four' , 'five' ])
year state pop debt one 2000 Ohio 1.5 NaN two 2001 Ohio 1.7 -1.2 three 2002 Ohio 3.6 NaN four 2001 Nevada 2.4 -1.5 five 2002 Nevada 2.9 -1.7
index
对象1
2
3
obj = Series(range(3 ), index=['a' ,'b' ,'c' ])
index = obj.index
index
Index(['a', 'b', 'c'], dtype='object')
index
对象是immutable
1
2
3
index = pd.Index(np.arange(3 ))
obj2 = Series([1.5 , -2.5 ,0 ],index=index)
obj2.index is index
True
基本功能 1
obj = Series([4.5 , 7.2 , -5.3 , 3.6 ], index=['b' ,'d' , 'a' ,'c' ])
b 4.5
d 7.2
a -5.3
c 3.6
dtype: float64
1
obj2 = obj.reindex(['a' ,'b' ,'c' ,'d' ,'e' ])
a -5.3
b 4.5
c 3.6
d 7.2
e NaN
dtype: float64
1
2
obj3 = Series(['blue' , 'purple' ,'yellow' ], index = [0 , 3 , 4 ])
obj3
0 blue
3 purple
4 yellow
dtype: object
1
obj3.reindex(np.arange(6 ), method='bfill' )
0 blue
1 purple
2 purple
3 purple
4 yellow
5 NaN
dtype: object
1
2
3
frame = DataFrame(np.arange(9 ).reshape(3 ,3 ), index = ['a' , 'c' ,'b' ],
columns=['Ohio' , 'Texas' ,'California' ])
frame
Ohio Texas California a 0 1 2 c 3 4 5 b 6 7 8
1
frame2 = frame.reindex(['a' ,'b' , 'c' ,'d' ])
Ohio Texas California a 0.0 1.0 2.0 b 6.0 7.0 8.0 c 3.0 4.0 5.0 d NaN NaN NaN
1
2
3
states = ['Texas' , 'Utah' ,'California' ]
frame3 = frame.reindex(columns=states)
frame3
Texas Utah California a 1 NaN 2 c 4 NaN 5 b 7 NaN 8
Index(['a', 'c', 'b'], dtype='object')
1
frame5 = frame.ix[['a' , 'b' ,'c' ,'d' ], states]
1
2
new_obj = frame5.drop('d' )
new_obj
Texas Utah California a 1.0 NaN 2.0 b 7.0 NaN 8.0 c 4.0 NaN 5.0
1
frame5.drop('Utah' , axis=1 )
Texas California a 1.0 2.0 b 7.0 8.0 c 4.0 5.0 d NaN NaN
索引、选取和过滤 Series
索引的工作方式类似于NumPy数组的索引
1
2
obj = Series(np.arange(4. ), index = ['a' ,'b' ,'c' ,'d' ])
obj
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
c 2.0
d 3.0
dtype: float64
0.0
d 3.0
c 2.0
dtype: float64
a 0.0
d 3.0
c 2.0
dtype: float64
a 0.0
c 2.0
dtype: float64
使用标签索引和Python
不同的是, 末尾是包含的
a 0.0
b 1.0
c 2.0
dtype: float64
DataFrame
索引
Texas Utah California a 1.0 NaN 2.0 b 7.0 NaN 8.0 c 4.0 NaN 5.0 d NaN NaN NaN
a 1.0
b 7.0
c 4.0
d NaN
Name: Texas, dtype: float64
Ohio Texas California a 0.0 NaN 2.0 c NaN 4.0 NaN b 6.0 NaN 8.0
Texas 4.0
Utah NaN
California 5.0
Name: c, dtype: float64
Texas 1.0
Utah NaN
California 2.0
Name: a, dtype: float64
1
frame5.ix[['a' ,'c' ],'Texas' ]
a 1.0
c 4.0
Name: Texas, dtype: float64
1
frame5.ix[['a' ,'d' ],['Texas' , 'Ohio' ]]
Texas Ohio a 1.0 NaN d NaN NaN
a 1.0
b 7.0
c 4.0
d NaN
Name: Texas, dtype: float64
1
frame5.ix[frame5.Texas%2 ==1 ,0 ]
a 1.0
b 7.0
Name: Texas, dtype: float64
DataFrame
的索引选项
obj[val]
选取列
obj.ix[val]
选取行
obj.ix[:, val]
选取列
obj.ix[val1, val2]
同时选取行列
reindex
将一个或多个轴匹配到新索引
算术运算和数据对齐 1
2
3
s1 = Series(np.arange(4.0 ), index = ['a' , 'b' ,'c' ,'d' ])
s2 = Series(np.arange(5.0 ), index = ['a' ,'c' ,'e' ,'g' ,'i' ])
s1
a 0.0
b 1.0
c 2.0
d 3.0
dtype: float64
a 0.0
c 1.0
e 2.0
g 3.0
i 4.0
dtype: float64
a 0.0
b NaN
c 3.0
d NaN
e NaN
g NaN
i NaN
dtype: float64
对于DataFrame
对齐会同时发生在行和列上
1
2
df1 = DataFrame(np.arange(9. ).reshape(3 , 3 ), columns=list('bcd' ), index=["A" , "B" , "C" ])
df2 = DataFrame(randn(16 ).reshape(4 ,4 ), columns=list('bdce' ), index=["A" , "D" , "C" , "E" ])
b c d A 0.0 1.0 2.0 B 3.0 4.0 5.0 C 6.0 7.0 8.0
b d c e A 0.303728 -0.716209 -1.532537 -1.234941 D -0.400836 -1.983266 0.017150 -0.678466 C -1.801502 0.125058 1.214294 -0.049006 E 0.764905 -1.760736 -0.494800 0.389546
b c d e A 0.303728 -0.532537 1.283791 NaN B NaN NaN NaN NaN C 4.198498 8.214294 8.125058 NaN D NaN NaN NaN NaN E NaN NaN NaN NaN
算术方法填充缺失值 1
df1.add(df2, fill_value= 0 )
b c d e A 0.303728 -0.532537 1.283791 -1.234941 B 3.000000 4.000000 5.000000 NaN C 4.198498 8.214294 8.125058 -0.049006 D -0.400836 0.017150 -1.983266 -0.678466 E 0.764905 -0.494800 -1.760736 0.389546
1
df2.add(df1, fill_value= 0 )
b c d e A 0.303728 -0.532537 1.283791 -1.234941 B 3.000000 4.000000 5.000000 NaN C 4.198498 8.214294 8.125058 -0.049006 D -0.400836 0.017150 -1.983266 -0.678466 E 0.764905 -0.494800 -1.760736 0.389546
DataFrame
和Series
之间的运算1
2
arr = np.arange(12 ).reshape(3 , 4 )
arr
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
array([[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
b c d A 0.0 1.0 2.0 B 3.0 4.0 5.0 C 6.0 7.0 8.0
b d c e A 0.303728 -0.716209 -1.532537 -1.234941 D -0.400836 -1.983266 0.017150 -0.678466 C -1.801502 0.125058 1.214294 -0.049006 E 0.764905 -1.760736 -0.494800 0.389546
1
2
3
s1 = df1.ix[0 ]
df1 - s1
b c d A 0.0 0.0 0.0 B 3.0 3.0 3.0 C 6.0 6.0 6.0
A 0.0
B 3.0
C 6.0
Name: b, dtype: float64
b c d A 0.0 1.0 2.0 B 0.0 1.0 2.0 C 0.0 1.0 2.0
函数应用和映射 1
2
3
frame = DataFrame(random.randn(4 , 3 ), columns = ['b' , 'd' ,'e' ],
index=['Utah' ,'Ohio' , 'Texas' ,'California' ])
frame
b d e Utah -0.390809 -0.628092 0.973094 Ohio 1.382407 0.610332 -1.573332 Texas 0.407925 0.607034 1.702876 California 0.308539 -0.437947 0.215211
1
f = lambda x: x.max() - x.min()
b 1.773216
d 1.238424
e 3.276208
dtype: float64
在R中的实现方法:1
2
f = function (x) max(x) - min(x)
apply(frame, 2 , f)
Utah 1.601187
Ohio 2.955739
Texas 1.294951
California 0.746486
dtype: float64
在R中的实现方法:
1
2
def f (x) :
return Series([x.min(), x.max()], index = ['min' , 'max' ])
b d e min -0.390809 -0.628092 -1.573332 max 1.382407 0.610332 1.702876
1
2
3
f = function (x) c( max = max(x), min = min(x) )
apply(frame, 1 , f)
apply(frame, 2 , f)
applymap
可以进行元素级的应用函数
1
2
format = lambda x: '%.2f' %x
frame.applymap(format)
b d e Utah -0.39 -0.63 0.97 Ohio 1.38 0.61 -1.57 Texas 0.41 0.61 1.70 California 0.31 -0.44 0.22
对应R
代码:1
2
format <- function(x) round(x, digits = 2)
apply(frame ,c(1 , 2 ), format)
排名和排序 1
2
s1 = Series(randn(5 ), index = list('acdbe' ))
s1
a -1.400520
c -0.396774
d -1.836984
b -0.202243
e 0.657407
dtype: float64
a -1.400520
b -0.202243
c -0.396774
d -1.836984
e 0.657407
dtype: float64
d -1.836984
a -1.400520
c -0.396774
b -0.202243
e 0.657407
dtype: float64
/Users/shihchosen/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:1: FutureWarning: order is deprecated, use sort_values(...)
if __name__ == '__main__':
d -1.836984
a -1.400520
c -0.396774
b -0.202243
e 0.657407
dtype: float64
b d e California 0.308539 -0.437947 0.215211 Ohio 1.382407 0.610332 -1.573332 Texas 0.407925 0.607034 1.702876 Utah -0.390809 -0.628092 0.973094
1
frame.sort_values(by = "b" , ascending=False )
b d e Ohio 1.382407 0.610332 -1.573332 Texas 0.407925 0.607034 1.702876 California 0.308539 -0.437947 0.215211 Utah -0.390809 -0.628092 0.973094
Rank
a -1.400520
c -0.396774
d -1.836984
b -0.202243
e 0.657407
dtype: float64
a 2.0
c 3.0
d 1.0
b 4.0
e 5.0
dtype: float64
如果有数值一样的项, 默认会分配平均排名, 通过method
参数可以指定其他的处理平级关系的方式
1
2
s2 = Series([2 , 4 , 3 , 1 ,4 ])
s2.rank()
0 2.0
1 4.5
2 3.0
3 1.0
4 4.5
dtype: float64
0 2.0
1 4.0
2 3.0
3 1.0
4 5.0
dtype: float64
破坏平级关系的method
选项:
average
默认 平均
min
使用整个分组最小的排名
max
使用整个分组的最大排名
first
按照出现次序排序
对总和计算描述统计
b 1.708061
d 0.151327
e 1.317850
dtype: float64
Utah -0.045807
Ohio 0.419407
Texas 2.717835
California 0.085803
dtype: float64
b d e Utah -0.390809 -0.628092 0.973094 Ohio 0.991598 -0.017760 -0.600237 Texas 1.399522 0.589274 1.102639 California 1.708061 0.151327 1.317850
b d e Utah -0.390809 -1.018901 -0.045807 Ohio 1.382407 1.992739 0.419407 Texas 0.407925 1.014959 2.717835 California 0.308539 -0.129408 0.085803
b d e count 4.000000 4.000000 4.000000 mean 0.427015 0.037832 0.329462 std 0.729386 0.663719 1.406438 min -0.390809 -0.628092 -1.573332 25% 0.133702 -0.485483 -0.231925 50% 0.358232 0.084544 0.594153 75% 0.651545 0.607859 1.155540 max 1.382407 0.610332 1.702876
相关系数与协方差 1
2
3
4
import pandas_datareader.data as web
all_data = {}
for ticker in ['BABA' , 'BIDU' , "JD" ]:
all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2016' , '9/1/2016' )
1
2
volume = DataFrame({tic: data["Volume" ] for tic, data in all_data.items()})
volume.head()
BABA BIDU JD Date 2016-01-04 23066300 4328000 18265300 2016-01-05 14258900 3367900 9426400 2016-01-06 11569300 2280600 12988900 2016-01-07 27288100 7009100 18155700 2016-01-08 20814600 4682800 15164100
1
2
price = DataFrame({tic: data["Adj Close" ] for tic, data in all_data.items()})
price.head()
BABA BIDU JD Date 2016-01-04 76.690002 184.029999 29.530001 2016-01-05 78.629997 187.369995 30.000000 2016-01-06 77.330002 185.610001 29.760000 2016-01-07 72.720001 174.369995 27.950001 2016-01-08 70.800003 171.339996 27.620001
1
2
returns = price.pct_change()
returns.tail()
BABA BIDU JD Date 2016-08-26 0.011922 0.003587 0.014556 2016-08-29 -0.001894 0.001038 -0.006979 2016-08-30 0.011804 0.010366 -0.000781 2016-08-31 0.012396 -0.024964 -0.007034 2016-09-01 0.002366 0.032034 0.011806
1
returns.BABA.corr(returns.JD)
0.63037358921035291
1
returns.BABA.corr(returns.BIDU)
0.49956986052264424
BABA BIDU JD BABA 1.000000 0.499570 0.630374 BIDU 0.499570 1.000000 0.586531 JD 0.630374 0.586531 1.000000
1
returns.corrwith(returns.BABA)
BABA 1.000000
BIDU 0.499570
JD 0.630374
dtype: float64
unique
,count
1
2
obj = Series(['a' , 'c' ,'c' , 'a' , 'b' , 'd' , 'e' ,'A' ])
obj.unique()
array(['a', 'c', 'b', 'd', 'e', 'A'], dtype=object)
a 2
c 2
b 1
A 1
e 1
d 1
dtype: int64
0 False
1 False
2 False
3 False
4 True
5 False
6 False
7 False
dtype: bool
处理缺失数据 1
2
string_data = Series(['aardvark' , 'artichoke' , np.nan, 'avocado' ])
string_data
0 aardvark
1 artichoke
2 NaN
3 avocado
dtype: object
0 False
1 False
2 True
3 False
dtype: bool
dropna
fillna
isnull
notnull
1
2
3
from numpy import nan as NA
data = Series([1 , NA, 3.5 , NA, 7 ])
data
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
dtype: float64
0 1.0
2 3.5
4 7.0
dtype: float64
对DataFrame
使用dropna
会把所有包含nan
的行删除
b c d e A 0.303728 -0.532537 1.283791 NaN B NaN NaN NaN NaN C 4.198498 8.214294 8.125058 NaN D NaN NaN NaN NaN E NaN NaN NaN NaN
参数how
只会丢弃全部是na
的行或列
b c d e A 0.303728 -0.532537 1.283791 NaN C 4.198498 8.214294 8.125058 NaN
1
df3.dropna(how = 'all' , axis = 1 )
b c d A 0.303728 -0.532537 1.283791 B NaN NaN NaN C 4.198498 8.214294 8.125058 D NaN NaN NaN E NaN NaN NaN
thresh
参数
1
df3.dropna(how = "all" , thresh=3 )
b c d e A 0.303728 -0.532537 1.283791 NaN C 4.198498 8.214294 8.125058 NaN
补全缺失值
b c d e A 0.303728 -0.532537 1.283791 0.0 B 0.000000 0.000000 0.000000 0.0 C 4.198498 8.214294 8.125058 0.0 D 0.000000 0.000000 0.000000 0.0 E 0.000000 0.000000 0.000000 0.0
1
2
df3.fillna(0 , inplace=True )
df3
b c d e A 0.303728 -0.532537 1.283791 0.0 B 0.000000 0.000000 0.000000 0.0 C 4.198498 8.214294 8.125058 0.0 D 0.000000 0.000000 0.000000 0.0 E 0.000000 0.000000 0.000000 0.0
层次化索引 1
2
3
data = Series(np.random.randn(10 ),
index = [['a' , 'a' , 'a' , 'b' , 'b' , 'b' ,'c' ,'c' ,'c' ,'d' ],
[1 , 2 , 3 ,1 , 2 , 3 ,1 , 2 , 3 ,1 ]])
a 1 0.838350
2 0.191902
3 -0.617628
b 1 -0.738438
2 -0.230139
3 1.577543
c 1 -0.313966
2 -0.682991
3 0.671963
d 1 1.315139
dtype: float64
MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0]])
1 0.838350
2 0.191902
3 -0.617628
dtype: float64
a 1 0.838350
2 0.191902
3 -0.617628
d 1 1.315139
dtype: float64
a 1 0.838350
2 0.191902
3 -0.617628
b 1 -0.738438
2 -0.230139
3 1.577543
c 1 -0.313966
2 -0.682991
3 0.671963
dtype: float64
a 0.838350
b -0.738438
c -0.313966
d 1.315139
dtype: float64
1
2
data1 = data.unstack()
data1
1 2 3 a 0.838350 0.191902 -0.617628 b -0.738438 -0.230139 1.577543 c -0.313966 -0.682991 0.671963 d 1.315139 NaN NaN
a 1 0.838350
2 0.191902
3 -0.617628
b 1 -0.738438
2 -0.230139
3 1.577543
c 1 -0.313966
2 -0.682991
3 0.671963
d 1 1.315139
dtype: float64
数据加载、存储与文件格式 读取文本格式的数据 1
2
zip_codes = pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv" )
zip_codes
zip_code latitude longitude city state county 0 501 40.922326 -72.637078 Holtsville NY Suffolk 1 544 40.922326 -72.637078 Holtsville NY Suffolk 2 601 18.165273 -66.722583 Adjuntas PR Adjuntas 3 602 18.393103 -67.180953 Aguada PR Aguada 4 603 18.455913 -67.145780 Aguadilla PR Aguadilla 5 604 18.493520 -67.135883 Aguadilla PR Aguadilla 6 605 18.465162 -67.141486 Aguadilla PR Aguadilla 7 606 18.172947 -66.944111 Maricao PR Maricao 8 610 18.288685 -67.139696 Anasco PR Anasco 9 611 18.279531 -66.802170 Angeles PR Utuado 10 612 18.450674 -66.698262 Arecibo PR Arecibo 11 613 18.458093 -66.732732 Arecibo PR Arecibo 12 614 18.429675 -66.674506 Arecibo PR Arecibo 13 616 18.444792 -66.640678 Bajadero PR Arecibo 14 617 18.447092 -66.544255 Barceloneta PR Barceloneta 15 622 17.998531 -67.187318 Boqueron PR Cabo Rojo 16 623 18.062201 -67.149541 Cabo Rojo PR Cabo Rojo 17 624 18.023535 -66.726156 Penuelas PR Penuelas 18 627 18.477891 -66.854770 Camuy PR Camuy 19 631 18.269187 -66.864993 Castaner PR Lares 20 636 18.113284 -67.039706 Rosario PR San German 21 637 18.087322 -66.934911 Sabana Grande PR Sabana Grande 22 638 18.336160 -66.472087 Ciales PR Ciales 23 641 18.250027 -66.698957 Utuado PR Utuado 24 646 18.436060 -66.281954 Dorado PR Dorado 25 647 17.969594 -66.939754 Ensenada PR Guanica 26 650 18.360125 -66.562311 Florida PR Florida 27 652 18.457254 -66.603358 Garrochales PR Arecibo 28 653 17.972468 -66.898661 Guanica PR Guanica 29 656 18.023280 -66.786909 Guayanilla PR Guayanilla … … … … … … … 42711 99803 58.377074 -134.617383 Juneau AK Juneau 42712 99811 58.383480 -134.197786 Juneau AK Juneau 42713 99820 57.500525 -134.585954 Angoon AK Skagway Hoonah Angoon 42714 99821 58.449413 -134.700348 Auke Bay AK Juneau 42715 99824 58.276308 -134.397642 Douglas AK Juneau 42716 99825 58.331798 -135.181256 Elfin Cove AK Skagway Hoonah Angoon 42717 99826 58.331798 -135.181256 Gustavus AK Skagway Hoonah Angoon 42718 99827 59.312688 -135.681153 Haines AK Haines 42719 99829 58.097655 
-135.431561 Hoonah AK Skagway Hoonah Angoon 42720 99830 56.973554 -133.935996 Kake AK Wrangell Petersburg 42721 99832 58.331798 -135.181256 Pelican AK Skagway Hoonah Angoon 42722 99833 56.767056 -132.931950 Petersburg AK Wrangell Petersburg 42723 99835 57.080818 -135.318348 Sitka AK Sitka 42724 99836 56.535950 -132.827948 Port Alexander AK Wrangell Petersburg 42725 99840 59.456753 -135.314852 Skagway AK Skagway Hoonah Angoon 42726 99841 57.798249 -135.285061 Tenakee Springs AK Skagway Hoonah Angoon 42727 99850 58.383480 -134.197786 Juneau AK Juneau 42728 99901 55.400674 -131.674090 Ketchikan AK Ketchikan Gateway 42729 99903 55.517921 -132.003244 Meyers Chuck AK Prince Wales Ketchikan 42730 99918 55.517921 -132.003244 Coffman Cove AK Prince Wales Ketchikan 42731 99919 55.126774 -131.580407 Thorne Bay AK Prince Wales Ketchikan 42732 99921 55.513301 -133.117022 Craig AK Prince Wales Ketchikan 42733 99922 55.208706 -132.825903 Hydaburg AK Prince Wales Ketchikan 42734 99923 55.517921 -132.003244 Hyder AK Prince Wales Ketchikan 42735 99925 55.554393 -133.090680 Klawock AK Prince Wales Ketchikan 42736 99926 55.094325 -131.566827 Metlakatla AK Prince Wales Ketchikan 42737 99927 55.517921 -132.003244 Point Baker AK Prince Wales Ketchikan 42738 99928 55.395359 -131.675370 Ward Cove AK Ketchikan Gateway 42739 99929 56.449893 -132.364407 Wrangell AK Wrangell Petersburg 42740 99950 55.542007 -131.432682 Ketchikan AK Ketchikan Gateway
42741 rows × 6 columns
1
names = ['zip_code' , 'latitude' , 'longitude' , 'city' , 'state' , 'county' ]
1
2
pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv" ,
sep="," , names= names, index_col = ['zip_code' ,'city' ])
latitude longitude state county zip_code city zip_code city latitude longitude state county 00501 Holtsville 40.922326 -72.637078 NY Suffolk 00544 Holtsville 40.922326 -72.637078 NY Suffolk 00601 Adjuntas 18.165273 -66.722583 PR Adjuntas 00602 Aguada 18.393103 -67.180953 PR Aguada 00603 Aguadilla 18.455913 -67.14578 PR Aguadilla 00604 Aguadilla 18.49352 -67.135883 PR Aguadilla 00605 Aguadilla 18.465162 -67.141486 PR Aguadilla 00606 Maricao 18.172947 -66.944111 PR Maricao 00610 Anasco 18.288685 -67.139696 PR Anasco 00611 Angeles 18.279531 -66.80217 PR Utuado 00612 Arecibo 18.450674 -66.698262 PR Arecibo 00613 Arecibo 18.458093 -66.732732 PR Arecibo 00614 Arecibo 18.429675 -66.674506 PR Arecibo 00616 Bajadero 18.444792 -66.640678 PR Arecibo 00617 Barceloneta 18.447092 -66.544255 PR Barceloneta 00622 Boqueron 17.998531 -67.187318 PR Cabo Rojo 00623 Cabo Rojo 18.062201 -67.149541 PR Cabo Rojo 00624 Penuelas 18.023535 -66.726156 PR Penuelas 00627 Camuy 18.477891 -66.85477 PR Camuy 00631 Castaner 18.269187 -66.864993 PR Lares 00636 Rosario 18.113284 -67.039706 PR San German 00637 Sabana Grande 18.087322 -66.934911 PR Sabana Grande 00638 Ciales 18.33616 -66.472087 PR Ciales 00641 Utuado 18.250027 -66.698957 PR Utuado 00646 Dorado 18.43606 -66.281954 PR Dorado 00647 Ensenada 17.969594 -66.939754 PR Guanica 00650 Florida 18.360125 -66.562311 PR Florida 00652 Garrochales 18.457254 -66.603358 PR Arecibo 00653 Guanica 17.972468 -66.898661 PR Guanica … … … … … … 99803 Juneau 58.377074 -134.617383 AK Juneau 99811 Juneau 58.38348 -134.197786 AK Juneau 99820 Angoon 57.500525 -134.585954 AK Skagway Hoonah Angoon 99821 Auke Bay 58.449413 -134.700348 AK Juneau 99824 Douglas 58.276308 -134.397642 AK Juneau 99825 Elfin Cove 58.331798 -135.181256 AK Skagway Hoonah Angoon 99826 Gustavus 58.331798 -135.181256 AK Skagway Hoonah Angoon 99827 Haines 59.312688 -135.681153 AK Haines 99829 Hoonah 58.097655 -135.431561 AK Skagway Hoonah Angoon 99830 Kake 56.973554 -133.935996 AK Wrangell 
Petersburg 99832 Pelican 58.331798 -135.181256 AK Skagway Hoonah Angoon 99833 Petersburg 56.767056 -132.931950 AK Wrangell Petersburg 99835 Sitka 57.080818 -135.318348 AK Sitka 99836 Port Alexander 56.53595 -132.827948 AK Wrangell Petersburg 99840 Skagway 59.456753 -135.314852 AK Skagway Hoonah Angoon 99841 Tenakee Springs 57.798249 -135.285061 AK Skagway Hoonah Angoon 99850 Juneau 58.38348 -134.197786 AK Juneau 99901 Ketchikan 55.400674 -131.674090 AK Ketchikan Gateway 99903 Meyers Chuck 55.517921 -132.003244 AK Prince Wales Ketchikan 99918 Coffman Cove 55.517921 -132.003244 AK Prince Wales Ketchikan 99919 Thorne Bay 55.126774 -131.580407 AK Prince Wales Ketchikan 99921 Craig 55.513301 -133.117022 AK Prince Wales Ketchikan 99922 Hydaburg 55.208706 -132.825903 AK Prince Wales Ketchikan 99923 Hyder 55.517921 -132.003244 AK Prince Wales Ketchikan 99925 Klawock 55.554393 -133.090680 AK Prince Wales Ketchikan 99926 Metlakatla 55.094325 -131.566827 AK Prince Wales Ketchikan 99927 Point Baker 55.517921 -132.003244 AK Prince Wales Ketchikan 99928 Ward Cove 55.395359 -131.675370 AK Ketchikan Gateway 99929 Wrangell 56.449893 -132.364407 AK Wrangell Petersburg 99950 Ketchikan 55.542007 -131.432682 AK Ketchikan Gateway
42742 rows × 4 columns
逐块读取文本文件 1
2
pd.read_csv("/Users/shihchosen/Documents/Intern/lyzs/temp/shiny-examples/063-superzip-example/data/zip_codes_states.csv" ,
nrows = 5 )
zip_code latitude longitude city state county 0 501 40.922326 -72.637078 Holtsville NY Suffolk 1 544 40.922326 -72.637078 Holtsville NY Suffolk 2 601 18.165273 -66.722583 Adjuntas PR Adjuntas 3 602 18.393103 -67.180953 Aguada PR Aguada 4 603 18.455913 -67.145780 Aguadilla PR Aguadilla
将数据写出到文本格式 1
zip_codes.to_csv("/Users/shihchosen/Downloads/zip_codes.csv" )
1
2
data = pd.read_csv('ch06/ex5.csv' )
data
something a b c d message 0 one 1 2 3.0 4 NaN 1 two 5 6 NaN 8 world 2 three 9 10 11.0 12 foo
1
data.to_csv(sys.stdout, na_rep="NULL" )
,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo
JSON
数据1
2
3
import json
f = open('ch06/nations.json' )
result = json.load(f)
[{'income': [[1800, 359.93],
[1820, 359.93],
[1913, 556.12],
[1950, 3363.02],
[1951, 3440.9],
[1952, 3520.61],
[1953, 3598.81],
[1954, 3450.82],
[1955, 3672.08],
[1956, 3549.04],
[1957, 3827.94],
[1958, 3966.42],
[1959, 3917.76],
[1960, 4006.21],
[1961, 4463.83],
[1962, 4269.28],
[1963, 4413.6],
[1964, 4826.49],
[1965, 5102.21],
[1966, 5308.14],
[1967, 5522.78],
[1968, 5346.63],
[1969, 5408.12],
[1970, 5651.88],
[1971, 5526.21],
[1972, 5473.29],
[1973, 5722.02],
[1974, 5470.21],
[1975, 3430.85],
[1976, 3050.32],
[1977, 3008.65],
[1978, 3070.82],
[1979, 3064.89],
[1980, 3074.75],
[1981, 2953.41],
[1982, 2756.95],
[1983, 2584.56],
[1984, 2527.47],
[1985, 2492.83],
[1986, 2220.61],
[1987, 2430.21],
[1988, 2728.53],
[1989, 2730.56],
[1990, 2777.42],
[1991, 2730.85],
[1992, 2627.85],
[1993, 1869.92],
[1994, 1851.45],
[1995, 1989.02],
[1996, 2157.35],
[1997, 2277.14],
[1998, 2384.48],
[1999, 2417.18],
[2000, 2446.65],
[2001, 2479.69],
[2002, 2773.29],
[2003, 2785.39],
[2004, 3007.11],
[2005, 3533],
[2006, 4069.56],
[2007, 4755.46],
[2008, 5228.74],
[2009, 5055.59]],
'lifeExpectancy': [[1800, 26.98],
[1940, 26.98],
[1950, 29.22],
[1951, 29.42],
[1952, 29.81],
[1953, 30.21],
[1954, 30.6],
[1955, 31],
[1956, 31.4],
[1957, 31.8],
[1958, 32.2],
[1959, 32.6],
[1960, 33],
[1961, 33.4],
[1962, 33.8],
[1963, 34.2],
[1964, 34.6],
[1965, 35],
[1966, 35.4],
[1967, 35.8],
[1968, 36.2],
[1969, 36.6],
[1970, 37],
[1971, 37.41],
[1972, 37.83],
[1973, 38.26],
[1974, 38.68],
[1975, 39.09],
[1976, 39.46],
[1977, 39.8],
[1978, 40.1],
[1979, 40.34],
[1980, 40.55],
[1981, 40.71],
[1982, 40.85],
[1983, 40.97],
[1984, 41.08],
[1985, 41.2],
[1986, 41.33],
[1987, 41.48],
[1988, 41.64],
[1989, 41.81],
[1990, 41.99],
[1991, 42.16],
[1992, 42.32],
[1993, 42.46],
[1994, 42.59],
[1995, 42.7],
[1996, 42.82],
[1997, 42.96],
[1998, 43.12],
[1999, 43.32],
[2000, 43.56],
[2001, 43.86],
[2002, 44.22],
[2003, 44.61],
[2004, 45.05],
[2005, 45.52],
[2006, 46.02],
[2007, 46.54],
[2008, 47.06],
[2009, 47.58]],
'name': 'Angola',
'population': [[1800, 1567028],
[1820, 1567028],
[1940, 3738000],
[1950, 4117617],
[1951, 4173095],
[1952, 4232095],
[1953, 4293840],
[1954, 4357527],
[1955, 4423223],
[1956, 4490992],
[1957, 4561361],
[1958, 4635885],
[1959, 4714676],
[1960, 4797344],
[1961, 4752489],
[1962, 4826015],
[1963, 4919586],
[1964, 5026044],
[1965, 5134818],
[1966, 5201250],
[1967, 5247469],
[1968, 5350384],
[1969, 5471641],
[1970, 5605626],
[1971, 5752775],
[1972, 5894858],
[1973, 6025841],
[1974, 5986432],
[1975, 5884241],
[1976, 5942225],
[1977, 6162675],
[1978, 6285716],
[1979, 6451227],
[1980, 6741465],
[1981, 6877697],
[1982, 7016384],
[1983, 7238214],
[1984, 7439658],
[1985, 7581504],
[1986, 7744932],
[1987, 7874230],
[1988, 8018724],
[1989, 8148595],
[1990, 8290856],
[1991, 8490763],
[1992, 8735988],
[1993, 8961438],
[1994, 9170032],
[1995, 9421477],
[1996, 9660081],
[1997, 9875024],
[1998, 10071442],
[1999, 10263229],
[2000, 10442812],
[2001, 10623424],
[2002, 10866106],
[2003, 11186202],
[2004, 11521432],
[2005, 11827315],
[2006, 12127071],
...
[1960, 1117376],
[1961, 1131821],
[1962, 1146757],
[1963, 1162199],
[1964, 1178277],
[1965, 1195012],
[1966, 1212426],
[1967, 1230542],
[1968, 1249383],
[1969, 1269000],
[1970, 1289420],
[1971, 1310672],
[1972, 1332786],
[1973, 1355793],
[1974, 1379655],
[1975, 1404403],
[1976, 1430069],
[1977, 1456688],
[1978, 1485467],
[1979, 1516462],
[1980, 1549589],
[1981, 1584814],
[1982, 1622136],
[1983, 1661573],
[1984, 1703154],
[1985, 1746923],
[1986, 1792933],
[1987, 1841240],
[1988, 1891894],
[1989, 1937201],
[1990, 1984449],
[1991, 2041306],
[1992, 2119465],
[1993, 2205240],
[1994, 2279436],
[1995, 2341749],
[1996, 2388865],
[1997, 2444741],
[1998, 2515309],
[1999, 2590516],
[2000, 2667859],
[2001, 2747312],
[2002, 2828858],
[2003, 2912584],
[2004, 2998563],
...
[1961, 7367.29],
[1962, 7133.17],
[1963, 6852.54],
[1964, 7452.89],
[1965, 8022.44],
[1991, 68639527],
...
[1996, 66.15],
[1997, 66.44],
[1998, 66.73],
[1999, 67.06],
[2000, 67.42],
[2001, 67.8],
[2002, 68.18],
[2003, 68.56],
[2004, 68.92],
[2005, 69.26],
[2006, 69.58],
[2007, 69.89],
[2008, 70.2],
[2009, 70.5]],
'name': 'Vanuatu',
'population': [[1800, 27791],
[1820, 27791],
[1950, 52000],
[1951, 53262],
[1952, 54554],
...
[1998, 182882],
[1999, 186282],
[2000, 189618],
[2001, 192910],
[2002, 196178],
[2003, 199414],
[2004, 202609],
[2005, 205754],
[2006, 208869],
[2007, 211971],
[2008, 215053]],
'region': 'East Asia & Pacific'}]
1
income = result[1 ]['income' ]
1
2
asjson = json.dumps(income)
income_df = DataFrame(income, columns=["Year" ,"Income" ])
Year Income 0 1800 553.72 1 1820 553.72 2 1913 855.53 3 1950 1104.47 4 1951 1083.57 5 1952 1062.75 6 1953 1012.84 7 1954 1021.29 8 1955 1000.66 9 1956 980.06 10 1957 959.60 11 1958 974.04 12 1959 988.88 13 1960 996.93 14 1961 1006.39 15 1962 949.50 16 1963 970.34 17 1964 1012.25 18 1965 1039.34 19 1966 1051.67 20 1967 1035.83 21 1968 1048.78 22 1969 1052.80 23 1970 1047.17 24 1971 1024.93 25 1972 1085.80 26 1973 1081.87 27 1974 973.64 28 1975 987.96 29 1976 1002.19 … … … 33 1980 1154.23 34 1981 1186.70 35 1982 1277.90 36 1983 1187.14 37 1984 1243.82 38 1985 1297.16 39 1986 1285.08 40 1987 1225.86 41 1988 1222.90 42 1989 1158.34 43 1990 1165.47 44 1991 1181.65 45 1992 1191.21 46 1993 1177.91 47 1994 1168.53 48 1995 1176.67 49 1996 1202.15 50 1997 1232.98 51 1998 1251.88 52 1999 1272.80 53 2000 1307.57 54 2001 1333.30 55 2002 1372.88 56 2003 1386.05 57 2004 1389.13 58 2005 1390.00 59 2006 1402.94 60 2007 1428.15 61 2008 1459.07 62 2009 1457.57
63 rows × 2 columns