Pandas 기초

import pandas as pd
casts = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/cast.csv',
                    index_col=None, encoding='utf-8')
casts.head()
title year name type character n
0 Closet Monster 2015 Buffy #1 actor Buffy 4 31.0
1 Suuri illusioni 1985 Homo $ actor Guests 22.0
2 Battle of the Sexes 2017 $hutter actor Bobby Riggs Fan 10.0
3 Secret in Their Eyes 2015 $hutter actor 2002 Dodger Fan NaN
4 Steve Jobs 2015 $hutter actor 1988 Opera House Patron NaN

Unstack

  • 10년 주기로 영화를 그룹화
c = casts
c.groupby([c['year']//10*10, 'type']).size().head(8)
year  type   
1910  actor       384
      actress     285
1920  actor       710
      actress     411
1930  actor      2628
      actress     820
1940  actor      3014
      actress     983
dtype: int64
  • 유형에 따라 데이터를 그룹화
c = casts
c_decade = c.groupby(['type', c['year']//10*10]).size()
c_decade
type     year
actor    1910      384
         1920      710
         1930     2628
         1940     3014
         1950     2877
         1960     2775
         1970     3044
         1980     3565
         1990     5108
         2000    10368
         2010    15523
         2020        4
actress  1910      285
         1920      411
         1930      820
         1940      983
         1950     1015
         1960      968
         1970     1299
         1980     1989
         1990     2544
         2000     5831
         2010     8853
         2020        3
dtype: int64
  • unstack 으로 새로운 dataframe 생성
c_decade.unstack()
year 1910 1920 1930 1940 1950 1960 1970 1980 1990 2000 2010 2020
type
actor 384 710 2628 3014 2877 2775 3044 3565 5108 10368 15523 4
actress 285 411 820 983 1015 968 1299 1989 2544 5831 8853 3
import matplotlib.pyplot as plt
c_decade.unstack().plot()
/Users/jinho/opt/anaconda3/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1192: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)





<AxesSubplot:xlabel='type'>
plt.show()

png

c_decade.unstack().plot(kind='bar')
plt.show()

png

  • 데이터를 나란히 표시하기 위해 unstack(0) 사용
c_decade.unstack(0)
type actor actress
year
1910 384 285
1920 710 411
1930 2628 820
1940 3014 983
1950 2877 1015
1960 2775 968
1970 3044 1299
1980 3565 1989
1990 5108 2544
2000 10368 5831
2010 15523 8853
2020 4 3
c_decade.unstack(0).plot(kind='bar')
plt.show()

png

merge

동일한 유형의 다른 데이터 파일을 병합하는 것

release = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/release_dates.csv',
                    index_col=None, encoding='utf-8')
release.head()
title year country date
0 #73, Shaanthi Nivaasa 2007 India 2007-06-15
1 #Beings 2015 Romania 2015-01-29
2 #Declimax 2018 Netherlands 2018-01-21
3 #Ewankosau saranghaeyo 2015 Philippines 2015-01-21
4 #Horror 2015 USA 2015-11-20
  • casts 파일에서 Amelia 를 필터링
c_amelia = casts[casts['title']=='Amelia']
c_amelia.head()
title year name type character n
5767 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0
23319 Amelia 2009 Jeremy Akerman actor Sheriff 19.0
  • release 파일에서 개봉일을 알아본다.
release[release['title']=='Amelia'].head()
title year country date
20543 Amelia 1966 Mexico 1966-03-10
20544 Amelia 2009 Canada 2009-10-23
20545 Amelia 2009 USA 2009-10-23
20546 Amelia 2009 Australia 2009-11-12
20547 Amelia 2009 Singapore 2009-11-12
  • 두개의 데이터 병합
c_amelia.merge(release).head()
title year name type character n country date
0 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 Canada 2009-10-23
1 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 USA 2009-10-23
2 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 Australia 2009-11-12
3 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 Singapore 2009-11-12
4 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 Ireland 2009-11-13
  • name 이 Aaron Abrams인 항목 필터링
c = casts[casts['name']=='Aaron Abrams']
c.head(2)
title year name type character n
5765 #FromJennifer 2017 Aaron Abrams actor Ralph Sinclair NaN
5766 388 Arletta Avenue 2011 Aaron Abrams actor Alex 4.0
  • 제목과 년도를 기준으로 병합
  • 병합하려면 영화 이름과 연도가 같아야한다
c.merge(casts, on=['title', 'year']).head()
title year name_x type_x character_x n_x name_y type_y character_y n_y
0 #FromJennifer 2017 Aaron Abrams actor Ralph Sinclair NaN Aaron Abrams actor Ralph Sinclair NaN
1 #FromJennifer 2017 Aaron Abrams actor Ralph Sinclair NaN Christian Ackerman actor Simon NaN
2 388 Arletta Avenue 2011 Aaron Abrams actor Alex 4.0 Graham Abbey actor Officer #2 8.0
3 388 Arletta Avenue 2011 Aaron Abrams actor Alex 4.0 Aaron Abrams actor Alex 4.0
4 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 Aaron Abrams actor Slim Gordon 8.0
c_costar = c.merge(casts, on=['title', 'year'])
c_costar = c_costar[c_costar['name_y']!='Aaron Abrams']
c_costar.head()
title year name_x type_x character_x n_x name_y type_y character_y n_y
1 #FromJennifer 2017 Aaron Abrams actor Ralph Sinclair NaN Christian Ackerman actor Simon NaN
2 388 Arletta Avenue 2011 Aaron Abrams actor Alex 4.0 Graham Abbey actor Officer #2 8.0
5 Amelia 2009 Aaron Abrams actor Slim Gordon 8.0 Jeremy Akerman actor Sheriff 19.0
8 Cinderella Man 2005 Aaron Abrams actor 1928 Fan 67.0 Nick Alachiotis actor Baer Cornerman 38.0
9 Cinderella Man 2005 Aaron Abrams actor 1928 Fan 67.0 Nick Alachiotis actor Undercard Boxer - Feldman 38.0

Index

데이터를 구성하고 데이터에 대한 빠른 액세스를 가능하게한다. %%timeit을 사용하여 작업에 필요한 시간을 비교할 수 있다.

import pandas as pd
cast = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/cast.csv',
                    index_col=None, encoding='utf-8')
cast.head()
title year name type character n
0 Closet Monster 2015 Buffy #1 actor Buffy 4 31.0
1 Suuri illusioni 1985 Homo $ actor Guests 22.0
2 Battle of the Sexes 2017 $hutter actor Bobby Riggs Fan 10.0
3 Secret in Their Eyes 2015 $hutter actor 2002 Dodger Fan NaN
4 Steve Jobs 2015 $hutter actor 1988 Opera House Patron NaN
  • index 생성
%%time
# indexing없이 데이터 접근

cast[cast['title']=='Macbeth']
CPU times: user 14.3 ms, sys: 1.64 ms, total: 16 ms
Wall time: 16.9 ms
title year name type character n
12868 Macbeth 2015 Darren Adamson actor Soldier NaN
22302 Macbeth 1916 Spottiswoode Aitken actor Duncan 4.0
25855 Macbeth 1948 Robert Alan actor Third Murderer NaN
26990 Macbeth 2016 John Albasiny actor Doctor NaN
38090 Macbeth 1948 William Alland actor Second Murderer 18.0
40639 Macbeth 1997 Stevie Allen actor Murderer 21.0
60543 Macbeth 2014 Moyo Akand? actress Witch NaN
63776 Macbeth 1916 Mary Alden actress Lady Macduff 6.0
%%timeit

# indexing 없이 데이터 접근
cast[cast['title']=='Macbeth']
3.72 ms ± 23.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
  • set index 를 통해 데이터에 대한 인덱스를 생성한다.
c = cast.set_index(['title'])
c.head(4)
year name type character n
title
Closet Monster 2015 Buffy #1 actor Buffy 4 31.0
Suuri illusioni 1985 Homo $ actor Guests 22.0
Battle of the Sexes 2017 $hutter actor Bobby Riggs Fan 10.0
Secret in Their Eyes 2015 $hutter actor 2002 Dodger Fan NaN
  • 더 빠른 작업을 위해 .loc 를 사용
%%time

c.loc['Macbeth']
CPU times: user 16.3 ms, sys: 2.59 ms, total: 18.9 ms
Wall time: 21.5 ms
year name type character n
title
Macbeth 2015 Darren Adamson actor Soldier NaN
Macbeth 1916 Spottiswoode Aitken actor Duncan 4.0
Macbeth 1948 Robert Alan actor Third Murderer NaN
Macbeth 2016 John Albasiny actor Doctor NaN
Macbeth 1948 William Alland actor Second Murderer 18.0
Macbeth 1997 Stevie Allen actor Murderer 21.0
Macbeth 2014 Moyo Akand? actress Witch NaN
Macbeth 1916 Mary Alden actress Lady Macduff 6.0
%%timeit

c.loc['Macbeth']
1.08 ms ± 2.13 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
  • 더 빠른 작업을 위해 인덱스를 정렬한다.
cs = cast.set_index(['title']).sort_index()
cs.tail(4)
year name type character n
title
xXx: Return of Xander Cage 2017 Julie Abcede actor Catwalk Partiers 84.0
xXx: Return of Xander Cage 2017 Jeimi Abila actress Lazarus' Girls 64.0
xXx: Return of Xander Cage 2017 Wayne Ambrose actor Choir Members 34.0
xXx: State of the Union 2005 Robert Alonzo actor Guard NaN
%%time
cs.loc['Macbeth']
CPU times: user 20.3 ms, sys: 804 µs, total: 21.1 ms
Wall time: 22.7 ms
year name type character n
title
Macbeth 1948 William Alland actor Second Murderer 18.0
Macbeth 1916 Spottiswoode Aitken actor Duncan 4.0
Macbeth 1916 Mary Alden actress Lady Macduff 6.0
Macbeth 1997 Stevie Allen actor Murderer 21.0
Macbeth 1948 Robert Alan actor Third Murderer NaN
Macbeth 2016 John Albasiny actor Doctor NaN
Macbeth 2014 Moyo Akand? actress Witch NaN
Macbeth 2015 Darren Adamson actor Soldier NaN
%%timeit

cs.loc['Macbeth']
70 µs ± 392 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

Multiple index

데이터에 여러 인덱스를 갖게 할 수 있다.

cm = cast.set_index(['title', 'n']).sort_index()
cm.tail(30)
year name type character
title n
Zwei in einem Anzug 2.0 1950 Wolf Albach-Retty actor Otto Vogel
Zwei in einem Auto 2.0 1951 Wolf Albach-Retty actor Georg Schmittlein
Zweimal zwei im Himmelbett 1.0 1937 Georg Alexander actor Arnd Krusemark
Zwischen Lachen und Weinen NaN 1919 Georg Alexander actor Hans
Zwischen Pankow und Zehlendorf 3.0 1991 Kathrin Ackermann actress Nora Permont
NaN 1991 Eugen Albert actor Soldat
Zwischen Welten 2.0 2014 Mohsin Ahmady actor Tarik
Zwischen den Sternen 2.0 2002 Fatih Alas actor Umut
Zwischen vierzehn und siebzehn - Sexualnot der Jugend 5.0 1929 Fritz Alberti actor Parent
Zwischengleis 7.0 1978 Alexander Allerson actor Bolle
Zyat iz provintsii NaN 1987 Dinmukhamet Akhimov actor ?Taksist?
NaN 1987 Kalampyr Ajsangaliyeva actress Luzhan
Zzikhimyeon jukneunda 3.0 2000 Jae-hwan Ahn actor The Teacher
bestefreunde 3.0 2014 Tina Amon Amonsen actress Vivian
bwoy NaN 2016 Drew (IV) Allen actor Boss
eCupid 33.0 2011 Christy Alvarado actress Patron
47.0 2011 Shon Abram actor Patron
el Ziara NaN 2014 Manel Abdelkoui actor Randa
fl 19,99 NaN 1998 Thomas Acda actor Harrie Harings
NaN 1998 Marijke Aerts actress Mevrouw van Veen
iBoy 9.0 2017 Shaquille Ali-Yebuah actor Cass
inter.m@tes 6.0 2004 Jojo Alejar actor Gus 'Hopeless' Geronimo
kingdom of ants NaN 2016 Zayar Abdulla actor Hadi
NaN 2016 Rupak Abdulkadr actress Rupak mother
les aventures d'archives NaN 2016 Buzz Aldrin actor Himself
w Delta z 8.0 2007 Barbara Adair actress Alice Jackson
xXx: Return of Xander Cage 34.0 2017 Wayne Ambrose actor Choir Members
64.0 2017 Jeimi Abila actress Lazarus' Girls
84.0 2017 Julie Abcede actor Catwalk Partiers
xXx: State of the Union NaN 2005 Robert Alonzo actor Guard
cm.loc['Macbeth']
year name type character
n
4.0 1916 Spottiswoode Aitken actor Duncan
6.0 1916 Mary Alden actress Lady Macduff
18.0 1948 William Alland actor Second Murderer
21.0 1997 Stevie Allen actor Murderer
NaN 2015 Darren Adamson actor Soldier
NaN 1948 Robert Alan actor Third Murderer
NaN 2016 John Albasiny actor Doctor
NaN 2014 Moyo Akand? actress Witch
cm.loc['Macbeth'].loc[4:18]
year name type character
n
4.0 1916 Spottiswoode Aitken actor Duncan
6.0 1916 Mary Alden actress Lady Macduff
18.0 1948 William Alland actor Second Murderer
cm.loc['Macbeth'].loc[4]
year                        1916
name         Spottiswoode Aitken
type                       actor
character                 Duncan
Name: 4.0, dtype: object
cm.head(2)
year name type character
title n
#1 Serial Killer 17.0 2013 Michael Alton actor Detective Roberts
#DigitalLivesMatter NaN 2016 Rashan Ali actress News Reporter

reset index

reset_index( ) 을 사용하여 인덱스를 제설정 할 수 있다.

cm = cm.reset_index('n')
cm.head(2)
n year name type character
title
#1 Serial Killer 17.0 2013 Michael Alton actor Detective Roberts
#DigitalLivesMatter NaN 2016 Rashan Ali actress News Reporter

CSV library

read file

import csv
titles = list(csv.DictReader(open('./pythondsp-pandasguide-b936c3b43406/data/titles.csv')))
titles[0:5]
[{'title': 'The Rising Son', 'year': '1990'},
 {'title': 'The Thousand Plane Raid', 'year': '1969'},
 {'title': 'Crucea de piatra', 'year': '1993'},
 {'title': 'Country', 'year': '2000'},
 {'title': 'Gaiking II', 'year': '2011'}]
titles[-5:]
[{'title': 'Rebel', 'year': '1970'},
 {'title': 'Suzanne', 'year': '1996'},
 {'title': 'Bomba', 'year': '2013'},
 {'title': 'Aao Jao Ghar Tumhara', 'year': '1984'},
 {'title': 'Mrs. Munck', 'year': '1995'}]
  • 별도의 행에 제목과 연도 표시
for k, v in titles[0].items():
    print(k, ':', v)
title : The Rising Son
year : 1990

연도에 따라 영화 표시

  • 1985년의 모든 영화 표시
year85 = [a for a in titles if a['year']=='1985']
year85[:5]
[{'title': 'Insaaf Main Karoonga', 'year': '1985'},
 {'title': 'Vivre pour survivre', 'year': '1985'},
 {'title': 'Water', 'year': '1985'},
 {'title': 'Doea tanda mata', 'year': '1985'},
 {'title': 'Koritsia gia tsibima', 'year': '1985'}]
  • 1990~1999년 영화표시
movies90 = [m for m in titles if (int(m['year'])<int('2000')) and (int(m['year'])>int('1989'))]
movies90[:5]
[{'title': 'The Rising Son', 'year': '1990'},
 {'title': 'Crucea de piatra', 'year': '1993'},
 {'title': 'Poka Makorer Ghar Bosoti', 'year': '1996'},
 {'title': 'Maa Durga Shakti', 'year': '1999'},
 {'title': 'Conflict of Interest', 'year': '1993'}]
  • 모든 ‘Macbeth’ 찾기
macbeth = [m for m in titles if m['title']=='Macbeth']
macbeth[:3]
[{'title': 'Macbeth', 'year': '1913'},
 {'title': 'Macbeth', 'year': '2006'},
 {'title': 'Macbeth', 'year': '2013'}]

itemgetter

  • 연도별로 영화 정렬
from operator import itemgetter
sorted(macbeth, key=itemgetter('year'))[:3]
[{'title': 'Macbeth', 'year': '1913'},
 {'title': 'Macbeth', 'year': '1997'},
 {'title': 'Macbeth', 'year': '1998'}]

공백 0으로 채우기

casts = list(csv.DictReader(open('./pythondsp-pandasguide-b936c3b43406/data/cast.csv')))
casts[3:5]
[{'title': 'Secret in Their Eyes',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '2002 Dodger Fan',
  'n': ''},
 {'title': 'Steve Jobs',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '1988 Opera House Patron',
  'n': ''}]
  • 0으로 대체
cast0 = [{**c, 'n':c['n'].replace('', '0')} for c in casts]
cast0[3:5]
[{'title': 'Secret in Their Eyes',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '2002 Dodger Fan',
  'n': '0'},
 {'title': 'Steve Jobs',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '1988 Opera House Patron',
  'n': '0'}]
  • Maa 로 시작하는 영화
maa = [m for m in titles if m['title'].startswith('Maa')]
maa[:3]
[{'title': 'Maa Durga Shakti', 'year': '1999'},
 {'title': 'Maarek hob', 'year': '2004'},
 {'title': 'Maa Aur Mamta', 'year': '1970'}]

collections.counter

  • 연도별로 영화계산
from collections import Counter

by_year = Counter(t['year']for t in titles)
by_year.most_common(3)
[('2016', 2363), ('2017', 2138), ('2015', 1849)]
import matplotlib.pyplot as plt

data = by_year.most_common(len(titles))
data = sorted(data)
x = [c[0] for c in data]
y = [c[1] for c in data]
plt.plot(x, y)
plt.show()

png

collections.defaultdict

  • 연도별로 dictionary에 영화 추가
from collections import defaultdict
d = defaultdict(list)
for row in titles:
    d[row['year']].append(row['title'])
    
xx=[]
yy=[]
for k, v in d.items():
    xx.append(k)
    yy.append(len(v))

plt.plot(sorted(xx), yy)
plt.show()

png

xx[:5]
['1990', '1969', '1993', '2000', '2011']
yy[:5]
[515, 465, 437, 616, 1457]
  • Aaron Abrams 의 모든 영화 보기
cf = [c for c in casts if c['name']=='Aaron Abrams']
cf[:3]
[{'title': '#FromJennifer',
  'year': '2017',
  'name': 'Aaron Abrams',
  'type': 'actor',
  'character': 'Ralph Sinclair',
  'n': ''},
 {'title': '388 Arletta Avenue',
  'year': '2011',
  'name': 'Aaron Abrams',
  'type': 'actor',
  'character': 'Alex',
  'n': '4'},
 {'title': 'Amelia',
  'year': '2009',
  'name': 'Aaron Abrams',
  'type': 'actor',
  'character': 'Slim Gordon',
  'n': '8'}]
dcf = defaultdict(list)
for row in cf:
    dcf[row['year']].append(row['title'])
    
dcf
defaultdict(list,
            {'2017': ['#FromJennifer', 'The Go-Getters'],
             '2011': ['388 Arletta Avenue',
              'Jesus Henry Christ',
              'Jesus Henry Christ',
              'Take This Waltz',
              'The Chicago 8'],
             '2009': ['Amelia', 'At Home by Myself... with You'],
             '2005': ['Cinderella Man', 'Sabah'],
             '2015': ['Closet Monster', 'Regression'],
             '2018': ['Code 8'],
             '2007': ['Firehouse Dog', 'Young People Fucking'],
             '2008': ['Flash of Genius'],
             '2013': ['It Was You Charlie'],
             '2004': ['Resident Evil: Apocalypse', 'Siblings'],
             '2003': ['The In-Laws', 'The Visual Bible: The Gospel of John'],
             '2006': ['Zoom']})