Pandas Basics
import pandas as pd
Creating a MultiIndex
data = pd.Series([10, 20, 30, 40, 15, 25, 35, 25],
index = [['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], ['obj1', 'obj2', 'obj3', 'obj4', 'obj1', 'obj2', 'obj3', 'obj4']])
data
a  obj1    10
   obj2    20
   obj3    30
   obj4    40
b  obj1    15
   obj2    25
   obj3    35
   obj4    25
dtype: int64
data.index
MultiIndex([('a', 'obj1'),
('a', 'obj2'),
('a', 'obj3'),
('a', 'obj4'),
('b', 'obj1'),
('b', 'obj2'),
('b', 'obj3'),
('b', 'obj4')],
)
Partial indexing
data['b']
obj1 15
obj2 25
obj3 35
obj4 25
dtype: int64
data[:, 'obj2']
a 20
b 25
dtype: int64
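For reference, a single element can also be selected with the full index tuple, and xs() gives the same cross-section as data[:, 'obj2'] (standard pandas calls, not part of the original run):
data.loc[('b', 'obj2')]    # scalar value 25
data.xs('obj2', level=1)   # same cross-section as data[:, 'obj2']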
Unstacking data
data.unstack(0)
a | b | |
---|---|---|
obj1 | 10 | 15 |
obj2 | 20 | 25 |
obj3 | 30 | 35 |
obj4 | 40 | 25 |
data.unstack(1)
obj1 | obj2 | obj3 | obj4 | |
---|---|---|---|---|
a | 10 | 20 | 30 | 40 |
b | 15 | 25 | 35 | 25 |
d = data.unstack()
d
obj1 | obj2 | obj3 | obj4 | |
---|---|---|---|---|
a | 10 | 20 | 30 | 40 |
b | 15 | 25 | 35 | 25 |
d.stack()
a  obj1    10
   obj2    20
   obj3    30
   obj4    40
b  obj1    15
   obj2    25
   obj3    35
   obj4    25
dtype: int64
column indexing
import numpy as np
df = pd.DataFrame(np.arange(12).reshape(4, 3),
index = [['a', 'a', 'b', 'b'], ['one', 'two', 'three', 'four']],
columns = [['num1', 'num2', 'num3'], ['red', 'green', 'red']]
)
df
 | | num1 | num2 | num3
---|---|---|---|---
 | | red | green | red
a | one | 0 | 1 | 2
 | two | 3 | 4 | 5
b | three | 6 | 7 | 8
 | four | 9 | 10 | 11
df.index
MultiIndex([('a', 'one'),
('a', 'two'),
('b', 'three'),
('b', 'four')],
)
df.columns
MultiIndex([('num1', 'red'),
('num2', 'green'),
('num3', 'red')],
)
df.index.names=['key1', 'key2']
df.columns.names=['n', 'color']
df
n | | num1 | num2 | num3
---|---|---|---|---
color | | red | green | red
key1 | key2 | | |
a | one | 0 | 1 | 2
 | two | 3 | 4 | 5
b | three | 6 | 7 | 8
 | four | 9 | 10 | 11
df['num1']
color | | red
---|---|---
key1 | key2 |
a | one | 0
 | two | 3
b | three | 6
 | four | 9
df.ix['a']
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-48-470835279a79> in <module>
----> 1 df.ix['a']
~/opt/anaconda3/envs/playdata/lib/python3.9/site-packages/pandas/core/generic.py in __getattr__(self, name)
5463 if self._info_axis._can_hold_identifiers_and_holds_name(name):
5464 return self[name]
-> 5465 return object.__getattribute__(self, name)
5466
5467 def __setattr__(self, name: str, value) -> None:
AttributeError: 'DataFrame' object has no attribute 'ix'
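The ix indexer was removed in pandas 1.0, which is why the call above fails; loc is the label-based replacement. A minimal equivalent (not part of the original run):
df.loc['a']             # rows whose outer index level is 'a'
df.loc['a', 'num1']     # combine outer-level row and column selection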
swaplevel
df.swaplevel('key1', 'key2')
n | | num1 | num2 | num3
---|---|---|---|---
color | | red | green | red
key2 | key1 | | |
one | a | 0 | 1 | 2
two | a | 3 | 4 | 5
three | b | 6 | 7 | 8
four | b | 9 | 10 | 11
df.sort_index(level='key2')
n | | num1 | num2 | num3
---|---|---|---|---
color | | red | green | red
key1 | key2 | | |
b | four | 9 | 10 | 11
a | one | 0 | 1 | 2
b | three | 6 | 7 | 8
a | two | 3 | 4 | 5
Summary statistics by level
df.sum(level = 'key1')
n | num1 | num2 | num3 |
---|---|---|---|
color | red | green | red |
key1 | |||
a | 3 | 5 | 7 |
b | 15 | 17 | 19 |
df.sum(level='color', axis=1)
color | | red | green
---|---|---|---
key1 | key2 | |
a | one | 2 | 1
 | two | 8 | 4
b | three | 14 | 7
 | four | 20 | 10
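In recent pandas versions the level argument to sum() is deprecated (and removed in pandas 2.0); the equivalent groupby form, as a sketch, is:
df.groupby(level='key1').sum()         # same result as df.sum(level='key1')
df.T.groupby(level='color').sum().T    # same result as df.sum(level='color', axis=1)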
File operations
Reading files
import pandas as pd
df = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex1.csv',
index_col=None, encoding='utf-8')
df
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
df = pd.read_table('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex1.csv',
sep=',', index_col=None, encoding='utf-8')
df
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
- header
df = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex2.csv',
index_col=None, encoding='utf-8', header=None)
df
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
df = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex2.csv',
index_col=None, encoding='utf-8', names=['a', 'b', 'c', 'd', 'message'])
df
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
df = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex2.csv',
encoding='utf-8', names=['a', 'b', 'c', 'd', 'message'],
index_col='message')
df
a | b | c | d | |
---|---|---|---|---|
message | ||||
hello | 1 | 2 | 3 | 4 |
world | 5 | 6 | 7 | 8 |
foo | 9 | 10 | 11 | 12 |
pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/csv_mindex.csv',
index_col=['key1', 'key2'], encoding='utf-8')
 | | value1 | value2
---|---|---|---
key1 | key2 | |
one | a | 1 | 2
 | b | 3 | 4
 | c | 5 | 6
 | d | 7 | 8
two | a | 9 | 10
 | b | 11 | 12
 | c | 13 | 14
 | d | 15 | 16
- skiprows
d = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex4.csv',
index_col=None, encoding='utf-8')
d
# hey! | ||||
---|---|---|---|---|
a | b | c | d | message |
# just wanted to make things more difficult for you | NaN | NaN | NaN | NaN |
# who reads CSV files with computers | anyway? | NaN | NaN | NaN |
1 | 2 | 3 | 4 | hello |
5 | 6 | 7 | 8 | world |
9 | 10 | 11 | 12 | foo |
d = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/ex4.csv',
index_col=None, encoding='utf-8', skiprows=[0, 2, 3])
d
a | b | c | d | message | |
---|---|---|---|---|---|
0 | 1 | 2 | 3 | 4 | hello |
1 | 5 | 6 | 7 | 8 | world |
2 | 9 | 10 | 11 | 12 | foo |
Writing data to a file
d.to_csv('d_out.csv')
d.to_csv('d_out2.csv', header=False, index=False)
d1 = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/d_out.csv',
index_col=None, encoding='utf-8')
d1
Unnamed: 0 | a | b | c | d | message | |
---|---|---|---|---|---|---|
0 | 0 | 1 | 2 | 3 | 4 | hello |
1 | 1 | 5 | 6 | 7 | 8 | world |
2 | 2 | 9 | 10 | 11 | 12 | foo |
d2 = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/readEx/d_out2.csv',
index_col=None, encoding='utf-8')
d2
0 | 1 | 2 | 3 | 4 | hello | |
---|---|---|---|---|---|---|
0 | 1 | 5 | 6 | 7 | 8 | world |
1 | 2 | 9 | 10 | 11 | 12 | foo |
Merge
df1 = pd.DataFrame({'key' : ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : range(7)})
df1
key | data1 | |
---|---|---|
0 | b | 0 |
1 | b | 1 |
2 | a | 2 |
3 | c | 3 |
4 | a | 4 |
5 | a | 5 |
6 | b | 6 |
df2 = pd.DataFrame({'key' : ['a', 'b', 'd', 'b'],
'data2' : range(4)})
df2
key | data2 | |
---|---|---|
0 | a | 0 |
1 | b | 1 |
2 | d | 2 |
3 | b | 3 |
pd.merge(df1, df2)
key | data1 | data2 | |
---|---|---|---|
0 | b | 0 | 1 |
1 | b | 0 | 3 |
2 | b | 1 | 1 |
3 | b | 1 | 3 |
4 | b | 6 | 1 |
5 | b | 6 | 3 |
6 | a | 2 | 0 |
7 | a | 4 | 0 |
8 | a | 5 | 0 |
df1 = pd.DataFrame({'key1' : ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : range(7)})
df2 = pd.DataFrame({'key2' : ['a', 'b', 'd', 'b'],
'data2' : range(4)})
pd.merge(df1, df2, left_on='key1', right_on='key2')
key1 | data1 | key2 | data2 | |
---|---|---|---|---|
0 | b | 0 | b | 1 |
1 | b | 0 | b | 3 |
2 | b | 1 | b | 1 |
3 | b | 1 | b | 3 |
4 | b | 6 | b | 1 |
5 | b | 6 | b | 3 |
6 | a | 2 | a | 0 |
7 | a | 4 | a | 0 |
8 | a | 5 | a | 0 |
left / right / outer join
pd.merge(df1, df2, left_on='key1', right_on='key2', how='left')
key1 | data1 | key2 | data2 | |
---|---|---|---|---|
0 | b | 0 | b | 1.0 |
1 | b | 0 | b | 3.0 |
2 | b | 1 | b | 1.0 |
3 | b | 1 | b | 3.0 |
4 | a | 2 | a | 0.0 |
5 | c | 3 | NaN | NaN |
6 | a | 4 | a | 0.0 |
7 | a | 5 | a | 0.0 |
8 | b | 6 | b | 1.0 |
9 | b | 6 | b | 3.0 |
pd.merge(df1, df2, left_on='key1', right_on='key2', how='right')
key1 | data1 | key2 | data2 | |
---|---|---|---|---|
0 | a | 2.0 | a | 0 |
1 | a | 4.0 | a | 0 |
2 | a | 5.0 | a | 0 |
3 | b | 0.0 | b | 1 |
4 | b | 1.0 | b | 1 |
5 | b | 6.0 | b | 1 |
6 | NaN | NaN | d | 2 |
7 | b | 0.0 | b | 3 |
8 | b | 1.0 | b | 3 |
9 | b | 6.0 | b | 3 |
pd.merge(df1, df2, left_on='key1', right_on='key2', how='outer')
key1 | data1 | key2 | data2 | |
---|---|---|---|---|
0 | b | 0.0 | b | 1.0 |
1 | b | 0.0 | b | 3.0 |
2 | b | 1.0 | b | 1.0 |
3 | b | 1.0 | b | 3.0 |
4 | b | 6.0 | b | 1.0 |
5 | b | 6.0 | b | 3.0 |
6 | a | 2.0 | a | 0.0 |
7 | a | 4.0 | a | 0.0 |
8 | a | 5.0 | a | 0.0 |
9 | c | 3.0 | NaN | NaN |
10 | NaN | NaN | d | 2.0 |
Concatenating data
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 1, 3], index=['c', 'd', 'e'])
s3 = pd.Series([4, 7], index=['a', 'e'])
s1
a 0
b 1
dtype: int64
s2
c 2
d 1
e 3
dtype: int64
s3
a 4
e 7
dtype: int64
pd.concat([s1, s2])
a 0
b 1
c 2
d 1
e 3
dtype: int64
pd.concat([s1, s2], axis=1)
0 | 1 | |
---|---|---|
a | 0.0 | NaN |
b | 1.0 | NaN |
c | NaN | 2.0 |
d | NaN | 1.0 |
e | NaN | 3.0 |
pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])
one    a    0
       b    1
two    c    2
       d    1
       e    3
three  a    4
       e    7
dtype: int64
pd.concat([df1, df2], join='inner', axis=1, keys=['one', 'two'])
 | one | | two |
---|---|---|---|---
 | key1 | data1 | key2 | data2
0 | b | 0 | a | 0
1 | b | 1 | b | 1
2 | a | 2 | d | 2
3 | c | 3 | b | 3
pd.concat({'level1':df1, 'level2':df2}, axis=1, join='inner')
 | level1 | | level2 |
---|---|---|---|---
 | key1 | data1 | key2 | data2
0 | b | 0 | a | 0
1 | b | 1 | b | 1
2 | a | 2 | d | 2
3 | c | 3 | b | 3
Data transformation
Removing duplicates
df = pd.DataFrame({'k1':['one']*3+['two']*4,
'k2':[1,1,2,3,3,4,4]})
df
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | one | 1 |
2 | one | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
df.duplicated()
0 False
1 True
2 False
3 False
4 True
5 False
6 True
dtype: bool
df.drop_duplicates()
k1 | k2 | |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
df.drop_duplicates(keep='last')
k1 | k2 | |
---|---|---|
1 | one | 1 |
2 | one | 2 |
4 | two | 3 |
6 | two | 4 |
df.drop_duplicates(['k1'])
k1 | k2 | |
---|---|---|
0 | one | 1 |
3 | two | 3 |
df.drop_duplicates(['k1', 'k2'])
k1 | k2 | |
---|---|---|
0 | one | 1 |
2 | one | 2 |
3 | two | 3 |
5 | two | 4 |
Replacing values
df.replace('one', 'One')
k1 | k2 | |
---|---|---|
0 | One | 1 |
1 | One | 1 |
2 | One | 2 |
3 | two | 3 |
4 | two | 3 |
5 | two | 4 |
6 | two | 4 |
df.replace(['one', 3], ['One', 30])
k1 | k2 | |
---|---|---|
0 | One | 1 |
1 | One | 1 |
2 | One | 2 |
3 | two | 30 |
4 | two | 30 |
5 | two | 4 |
6 | two | 4 |
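replace() also accepts a mapping; a dict form equivalent to the list call above (not in the original run):
df.replace({'one': 'One', 3: 30})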
groupby and data aggregation
Basics
df = pd.DataFrame({'k1':['a', 'a', 'b', 'b', 'a'],
'k2':['one', 'two', 'one', 'two', 'one'],
'data1':[2, 3, 3, 2, 4],
'data2':[5, 5, 5, 5, 10]})
df
k1 | k2 | data1 | data2 | |
---|---|---|---|---|
0 | a | one | 2 | 5 |
1 | a | two | 3 | 5 |
2 | b | one | 3 | 5 |
3 | b | two | 2 | 5 |
4 | a | one | 4 | 10 |
gp1 = df['data1'].groupby(df['k1'])
gp1
<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff3d800a910>
gp1.mean()
k1
a 3.0
b 2.5
Name: data1, dtype: float64
gp2 = df['data1'].groupby([df['k1'], df['k2']])
mean = gp2.mean()
mean
k1 k2
a   one    3.0
    two    3.0
b   one    3.0
    two    2.0
Name: data1, dtype: float64
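Because the grouped result carries a MultiIndex, it can be pivoted back into a table; a small addition not shown in the original output:
mean.unstack()    # k2 values become columns, k1 stays as the index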
Iterating over groups
for name, group in gp1:
print(name)
print(group)
a
0 2
1 3
4 4
Name: data1, dtype: int64
b
2 3
3 2
Name: data1, dtype: int64
for name, group in gp2:
print(name)
print(group)
('a', 'one')
0 2
4 4
Name: data1, dtype: int64
('a', 'two')
1 3
Name: data1, dtype: int64
('b', 'one')
2 3
Name: data1, dtype: int64
('b', 'two')
3 2
Name: data1, dtype: int64
for (k1, k2), group in gp2:
print(k1, k2)
print(group)
a one
0 2
4 4
Name: data1, dtype: int64
a two
1 3
Name: data1, dtype: int64
b one
2 3
Name: data1, dtype: int64
b two
3 2
Name: data1, dtype: int64
Data aggregation
gp1.max()
k1
a 4
b 3
Name: data1, dtype: int64
gp2.min()
k1 k2
a   one    2
    two    3
b   one    3
    two    2
Name: data1, dtype: int64
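Several aggregations can be computed in one call with agg(); a short sketch using standard reducer names (not in the original run):
gp1.agg(['min', 'max', 'mean'])    # one column per aggregation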
Dates and times
Creating times
import pandas as pd
import numpy as np
rng = pd.date_range('2011-03-01 10:15', periods = 10, freq = 'M')
rng
DatetimeIndex(['2011-03-31 10:15:00', '2011-04-30 10:15:00',
'2011-05-31 10:15:00', '2011-06-30 10:15:00',
'2011-07-31 10:15:00', '2011-08-31 10:15:00',
'2011-09-30 10:15:00', '2011-10-31 10:15:00',
'2011-11-30 10:15:00', '2011-12-31 10:15:00'],
dtype='datetime64[ns]', freq='M')
rng = pd.date_range('2015 Jul 2 10:15', periods = 10, freq = 'M')
rng
DatetimeIndex(['2015-07-31 10:15:00', '2015-08-31 10:15:00',
'2015-09-30 10:15:00', '2015-10-31 10:15:00',
'2015-11-30 10:15:00', '2015-12-31 10:15:00',
'2016-01-31 10:15:00', '2016-02-29 10:15:00',
'2016-03-31 10:15:00', '2016-04-30 10:15:00'],
dtype='datetime64[ns]', freq='M')
- Creating with start and end
rng = pd.date_range(start='2015 Jul 2 10:15', end='2015 July 12', freq='12H')
rng
DatetimeIndex(['2015-07-02 10:15:00', '2015-07-02 22:15:00',
'2015-07-03 10:15:00', '2015-07-03 22:15:00',
'2015-07-04 10:15:00', '2015-07-04 22:15:00',
'2015-07-05 10:15:00', '2015-07-05 22:15:00',
'2015-07-06 10:15:00', '2015-07-06 22:15:00',
'2015-07-07 10:15:00', '2015-07-07 22:15:00',
'2015-07-08 10:15:00', '2015-07-08 22:15:00',
'2015-07-09 10:15:00', '2015-07-09 22:15:00',
'2015-07-10 10:15:00', '2015-07-10 22:15:00',
'2015-07-11 10:15:00', '2015-07-11 22:15:00'],
dtype='datetime64[ns]', freq='12H')
len(rng)
20
rng = pd.date_range(start='2015 Jul 2 10:15', end='2015 July 12', freq='12H', tz='Asia/Kolkata')
rng
DatetimeIndex(['2015-07-02 10:15:00+05:30', '2015-07-02 22:15:00+05:30',
'2015-07-03 10:15:00+05:30', '2015-07-03 22:15:00+05:30',
'2015-07-04 10:15:00+05:30', '2015-07-04 22:15:00+05:30',
'2015-07-05 10:15:00+05:30', '2015-07-05 22:15:00+05:30',
'2015-07-06 10:15:00+05:30', '2015-07-06 22:15:00+05:30',
'2015-07-07 10:15:00+05:30', '2015-07-07 22:15:00+05:30',
'2015-07-08 10:15:00+05:30', '2015-07-08 22:15:00+05:30',
'2015-07-09 10:15:00+05:30', '2015-07-09 22:15:00+05:30',
'2015-07-10 10:15:00+05:30', '2015-07-10 22:15:00+05:30',
'2015-07-11 10:15:00+05:30', '2015-07-11 22:15:00+05:30'],
dtype='datetime64[ns, Asia/Kolkata]', freq='12H')
rng.tz_convert('Australia/sydney')
DatetimeIndex(['2015-07-02 14:45:00+10:00', '2015-07-03 02:45:00+10:00',
'2015-07-03 14:45:00+10:00', '2015-07-04 02:45:00+10:00',
'2015-07-04 14:45:00+10:00', '2015-07-05 02:45:00+10:00',
'2015-07-05 14:45:00+10:00', '2015-07-06 02:45:00+10:00',
'2015-07-06 14:45:00+10:00', '2015-07-07 02:45:00+10:00',
'2015-07-07 14:45:00+10:00', '2015-07-08 02:45:00+10:00',
'2015-07-08 14:45:00+10:00', '2015-07-09 02:45:00+10:00',
'2015-07-09 14:45:00+10:00', '2015-07-10 02:45:00+10:00',
'2015-07-10 14:45:00+10:00', '2015-07-11 02:45:00+10:00',
'2015-07-11 14:45:00+10:00', '2015-07-12 02:45:00+10:00'],
dtype='datetime64[ns, Australia/Sydney]', freq='12H')
type(rng[0])
pandas._libs.tslibs.timestamps.Timestamp
Converting strings to dates
dd=['07/07/2015', '08/12/2015', '12/04/2015']
dd
['07/07/2015', '08/12/2015', '12/04/2015']
type(dd[0])
str
list(pd.to_datetime(dd))
[Timestamp('2015-07-07 00:00:00'),
Timestamp('2015-08-12 00:00:00'),
Timestamp('2015-12-04 00:00:00')]
d = list(pd.to_datetime(dd, dayfirst=True))
d
[Timestamp('2015-07-07 00:00:00'),
Timestamp('2015-12-08 00:00:00'),
Timestamp('2015-04-12 00:00:00')]
type(d[0])
pandas._libs.tslibs.timestamps.Timestamp
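When the string layout is known, passing an explicit format removes the day/month ambiguity; a sketch assuming day-first strings (not in the original run):
pd.to_datetime(dd, format='%d/%m/%Y')    # parses all three strings as day/month/year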
periods
pr = pd.Period('2012', freq='M')
pr.asfreq('D', 'start')
Period('2012-01-01', 'D')
pr.asfreq('D', 'end')
Period('2012-01-31', 'D')
- Period arithmetic
pr = pd.Period('2012', freq='A')
pr
Period('2012', 'A-DEC')
pr+1
Period('2013', 'A-DEC')
prMonth = pr.asfreq('M')
prMonth
Period('2012-12', 'M')
prMonth - 1
Period('2012-11', 'M')
Creating periods from a range
prg = pd.period_range('2010', '2015', freq='A')
prg
PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], dtype='period[A-DEC]', freq='A-DEC')
data = pd.Series(np.random.rand(len(prg)), index=prg)
data
2010 0.773062
2011 0.246164
2012 0.298947
2013 0.096472
2014 0.279433
2015 0.894872
Freq: A-DEC, dtype: float64
Converting string dates to periods
dates = ['2013-02-02', '2012-02-02', '2013-02-02']
d = pd.to_datetime(dates)
d
DatetimeIndex(['2013-02-02', '2012-02-02', '2013-02-02'], dtype='datetime64[ns]', freq=None)
prd = d.to_period(freq='M')
prd
PeriodIndex(['2013-02', '2012-02', '2013-02'], dtype='period[M]', freq='M')
prd.asfreq('D')
PeriodIndex(['2013-02-28', '2012-02-29', '2013-02-28'], dtype='period[D]', freq='D')
prd.asfreq('Y')
PeriodIndex(['2013', '2012', '2013'], dtype='period[A-DEC]', freq='A-DEC')
Converting periods to timestamps
prd
PeriodIndex(['2013-02', '2012-02', '2013-02'], dtype='period[M]', freq='M')
prd.to_timestamp()
DatetimeIndex(['2013-02-01', '2012-02-01', '2013-02-01'], dtype='datetime64[ns]', freq=None)
prd.to_timestamp(how='end')
DatetimeIndex(['2013-02-28 23:59:59.999999999',
'2012-02-29 23:59:59.999999999',
'2013-02-28 23:59:59.999999999'],
dtype='datetime64[ns]', freq=None)
Time offsets
pd.Timedelta('3 days')
Timedelta('3 days 00:00:00')
pd.Timedelta('3M')
/Users/jinho/opt/anaconda3/envs/playdata/lib/python3.9/site-packages/IPython/core/interactiveshell.py:3441: FutureWarning: Units 'M', 'Y' and 'y' do not represent unambiguous timedelta values and will be removed in a future version
exec(code_obj, self.user_global_ns, self.user_ns)
Timedelta('0 days 00:03:00')
pd.Timedelta('4 days 3M')
Timedelta('4 days 00:03:00')
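As the warning above notes, 'M' inside a Timedelta is read as minutes, not months; a calendar-month shift needs a DateOffset instead (a sketch, not in the original run):
pd.Timestamp('2016-07-09 12:00') + pd.DateOffset(months=3)    # Timestamp('2016-10-09 12:00:00')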
pd.Timestamp('9 July 2016 12:00') + pd.Timedelta('1 day 3 min')
Timestamp('2016-07-10 12:03:00')
rng + pd.Timedelta('1 day')
DatetimeIndex(['2015-07-03 10:15:00+05:30', '2015-07-03 22:15:00+05:30',
'2015-07-04 10:15:00+05:30', '2015-07-04 22:15:00+05:30',
'2015-07-05 10:15:00+05:30', '2015-07-05 22:15:00+05:30',
'2015-07-06 10:15:00+05:30', '2015-07-06 22:15:00+05:30',
'2015-07-07 10:15:00+05:30', '2015-07-07 22:15:00+05:30',
'2015-07-08 10:15:00+05:30', '2015-07-08 22:15:00+05:30',
'2015-07-09 10:15:00+05:30', '2015-07-09 22:15:00+05:30',
'2015-07-10 10:15:00+05:30', '2015-07-10 22:15:00+05:30',
'2015-07-11 10:15:00+05:30', '2015-07-11 22:15:00+05:30',
'2015-07-12 10:15:00+05:30', '2015-07-12 22:15:00+05:30'],
dtype='datetime64[ns, Asia/Kolkata]', freq='12H')
Time and index data
dates = pd.date_range('2015-01-12', '2015-06-14', freq = 'M')
dates
DatetimeIndex(['2015-01-31', '2015-02-28', '2015-03-31', '2015-04-30',
'2015-05-31'],
dtype='datetime64[ns]', freq='M')
len(dates)
5
atemp = pd.Series([100.2, 98, 93, 98, 100], index=dates)
atemp
2015-01-31 100.2
2015-02-28 98.0
2015-03-31 93.0
2015-04-30 98.0
2015-05-31 100.0
Freq: M, dtype: float64
idx = atemp.index[3]
idx
Timestamp('2015-04-30 00:00:00', freq='M')
atemp[idx]
98.0
stemp = pd.Series([89, 98, 100, 88, 89], index=dates)
stemp
2015-01-31 89
2015-02-28 98
2015-03-31 100
2015-04-30 88
2015-05-31 89
Freq: M, dtype: int64
temps = pd.DataFrame({'Auckland':atemp, 'Delhi':stemp})
temps
Auckland | Delhi | |
---|---|---|
2015-01-31 | 100.2 | 89 |
2015-02-28 | 98.0 | 98 |
2015-03-31 | 93.0 | 100 |
2015-04-30 | 98.0 | 88 |
2015-05-31 | 100.0 | 89 |
temps['Auckland']
2015-01-31 100.2
2015-02-28 98.0
2015-03-31 93.0
2015-04-30 98.0
2015-05-31 100.0
Freq: M, Name: Auckland, dtype: float64
temps['Diff'] = temps['Auckland'] - temps['Delhi']
temps
Auckland | Delhi | Diff | |
---|---|---|---|
2015-01-31 | 100.2 | 89 | 11.2 |
2015-02-28 | 98.0 | 98 | 0.0 |
2015-03-31 | 93.0 | 100 | -7.0 |
2015-04-30 | 98.0 | 88 | 10.0 |
2015-05-31 | 100.0 | 89 | 11.0 |
del temps['Diff']
temps
Auckland | Delhi | |
---|---|---|
2015-01-31 | 100.2 | 89 |
2015-02-28 | 98.0 | 98 |
2015-03-31 | 93.0 | 100 |
2015-04-30 | 98.0 | 88 |
2015-05-31 | 100.0 | 89 |
Application
import pandas as pd
df = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/time/stocks.csv')
df.head()
Unnamed: 0 | date | AA | GE | IBM | MSFT | |
---|---|---|---|---|---|---|
0 | 0 | 1990-02-01 00:00:00 | 4.98 | 2.87 | 16.79 | 0.51 |
1 | 1 | 1990-02-02 00:00:00 | 5.04 | 2.87 | 16.89 | 0.51 |
2 | 2 | 1990-02-05 00:00:00 | 5.07 | 2.87 | 17.32 | 0.51 |
3 | 3 | 1990-02-06 00:00:00 | 5.01 | 2.88 | 17.56 | 0.51 |
4 | 4 | 1990-02-07 00:00:00 | 5.04 | 2.91 | 17.93 | 0.51 |
d = df.date[0]
d
'1990-02-01 00:00:00'
type(d)
str
df = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/time/stocks.csv',
parse_dates=['date'], index_col='date')
df.head()
Unnamed: 0 | AA | GE | IBM | MSFT | |
---|---|---|---|---|---|
date | |||||
1990-02-01 | 0 | 4.98 | 2.87 | 16.79 | 0.51 |
1990-02-02 | 1 | 5.04 | 2.87 | 16.89 | 0.51 |
1990-02-05 | 2 | 5.07 | 2.87 | 17.32 | 0.51 |
1990-02-06 | 3 | 5.01 | 2.88 | 17.56 | 0.51 |
1990-02-07 | 4 | 5.04 | 2.91 | 17.93 | 0.51 |
del df['Unnamed: 0']
df.head()
AA | GE | IBM | MSFT | |
---|---|---|---|---|
date | ||||
1990-02-01 | 4.98 | 2.87 | 16.79 | 0.51 |
1990-02-02 | 5.04 | 2.87 | 16.89 | 0.51 |
1990-02-05 | 5.07 | 2.87 | 17.32 | 0.51 |
1990-02-06 | 5.01 | 2.88 | 17.56 | 0.51 |
1990-02-07 | 5.04 | 2.91 | 17.93 | 0.51 |
df.index.name
'date'
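With a DatetimeIndex, rows can also be selected by a partial date string (a standard pandas feature, not shown in the original run):
df.loc['1990-02'].head()    # all rows from February 1990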
resampling
stocks = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/time/stocks.csv',
parse_dates=['date'])
stocks.head()
Unnamed: 0 | date | AA | GE | IBM | MSFT | |
---|---|---|---|---|---|---|
0 | 0 | 1990-02-01 | 4.98 | 2.87 | 16.79 | 0.51 |
1 | 1 | 1990-02-02 | 5.04 | 2.87 | 16.89 | 0.51 |
2 | 2 | 1990-02-05 | 5.07 | 2.87 | 17.32 | 0.51 |
3 | 3 | 1990-02-06 | 5.01 | 2.88 | 17.56 | 0.51 |
4 | 4 | 1990-02-07 | 5.04 | 2.91 | 17.93 | 0.51 |
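A minimal resampling sketch (not part of the original output): downsample the daily prices to month-end averages, using the date column as the index.
monthly = stocks.set_index('date').resample('M').mean()    # month-end mean of each column
monthly.head()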