Pandas 기초

import pandas as pd

casts = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/cast.csv',
                    index_col=None, encoding='utf-8')
casts.head()

	title	year	name	type	character	n
0	Closet Monster	2015	Buffy #1	actor	Buffy 4	31.0
1	Suuri illusioni	1985	Homo $	actor	Guests	22.0
2	Battle of the Sexes	2017	$hutter	actor	Bobby Riggs Fan	10.0
3	Secret in Their Eyes	2015	$hutter	actor	2002 Dodger Fan	NaN
4	Steve Jobs	2015	$hutter	actor	1988 Opera House Patron	NaN

Unstack

10년 주기로 영화를 그룹화

c = casts
c.groupby([c['year']//10*10, 'type']).size().head(8)

year  type   
1910  actor       384
      actress     285
1920  actor       710
      actress     411
1930  actor      2628
      actress     820
1940  actor      3014
      actress     983
dtype: int64

유형에 따라 데이터를 그룹화

c = casts
c_decade = c.groupby(['type', c['year']//10*10]).size()
c_decade

type     year
actor    1910      384
         1920      710
         1930     2628
         1940     3014
         1950     2877
         1960     2775
         1970     3044
         1980     3565
         1990     5108
         2000    10368
         2010    15523
         2020        4
actress  1910      285
         1920      411
         1930      820
         1940      983
         1950     1015
         1960      968
         1970     1299
         1980     1989
         1990     2544
         2000     5831
         2010     8853
         2020        3
dtype: int64

unstack 으로 새로운 dataframe 생성

c_decade.unstack()

year	1910	1920	1930	1940	1950	1960	1970	1980	1990	2000	2010	2020
type
actor	384	710	2628	3014	2877	2775	3044	3565	5108	10368	15523	4
actress	285	411	820	983	1015	968	1299	1989	2544	5831	8853	3

import matplotlib.pyplot as plt

c_decade.unstack().plot()

/Users/jinho/opt/anaconda3/lib/python3.8/site-packages/pandas/plotting/_matplotlib/core.py:1192: UserWarning: FixedFormatter should only be used together with FixedLocator
  ax.set_xticklabels(xticklabels)

<AxesSubplot:xlabel='type'>

plt.show()

png

c_decade.unstack().plot(kind='bar')
plt.show()

png

데이터를 나란히 표시하기 위해 unstack(0) 사용

c_decade.unstack(0)

type	actor	actress
year
1910	384	285
1920	710	411
1930	2628	820
1940	3014	983
1950	2877	1015
1960	2775	968
1970	3044	1299
1980	3565	1989
1990	5108	2544
2000	10368	5831
2010	15523	8853
2020	4	3

c_decade.unstack(0).plot(kind='bar')
plt.show()

png

merge

동일한 유형의 다른 데이터 파일을 병합하는 것

release = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/release_dates.csv',
                    index_col=None, encoding='utf-8')
release.head()

	title	year	country	date
0	#73, Shaanthi Nivaasa	2007	India	2007-06-15
1	#Beings	2015	Romania	2015-01-29
2	#Declimax	2018	Netherlands	2018-01-21
3	#Ewankosau saranghaeyo	2015	Philippines	2015-01-21
4	#Horror	2015	USA	2015-11-20

casts 파일에서 Amelia 를 필터링

c_amelia = casts[casts['title']=='Amelia']
c_amelia.head()

	title	year	name	type	character	n
5767	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0
23319	Amelia	2009	Jeremy Akerman	actor	Sheriff	19.0

release 파일에서 개봉일을 알아본다.

release[release['title']=='Amelia'].head()

	title	year	country	date
20543	Amelia	1966	Mexico	1966-03-10
20544	Amelia	2009	Canada	2009-10-23
20545	Amelia	2009	USA	2009-10-23
20546	Amelia	2009	Australia	2009-11-12
20547	Amelia	2009	Singapore	2009-11-12

두개의 데이터 병합

c_amelia.merge(release).head()

	title	year	name	type	character	n	country	date
0	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	Canada	2009-10-23
1	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	USA	2009-10-23
2	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	Australia	2009-11-12
3	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	Singapore	2009-11-12
4	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	Ireland	2009-11-13

name 이 Aaron Abrams인 항목 필터링

c = casts[casts['name']=='Aaron Abrams']
c.head(2)

	title	year	name	type	character	n
5765	#FromJennifer	2017	Aaron Abrams	actor	Ralph Sinclair	NaN
5766	388 Arletta Avenue	2011	Aaron Abrams	actor	Alex	4.0

제목과 년도를 기준으로 병합
병합하려면 영화 이름과 연도가 같아야한다

c.merge(casts, on=['title', 'year']).head()

	title	year	name_x	type_x	character_x	n_x	name_y	type_y	character_y	n_y
0	#FromJennifer	2017	Aaron Abrams	actor	Ralph Sinclair	NaN	Aaron Abrams	actor	Ralph Sinclair	NaN
1	#FromJennifer	2017	Aaron Abrams	actor	Ralph Sinclair	NaN	Christian Ackerman	actor	Simon	NaN
2	388 Arletta Avenue	2011	Aaron Abrams	actor	Alex	4.0	Graham Abbey	actor	Officer #2	8.0
3	388 Arletta Avenue	2011	Aaron Abrams	actor	Alex	4.0	Aaron Abrams	actor	Alex	4.0
4	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	Aaron Abrams	actor	Slim Gordon	8.0

c_costar = c.merge(casts, on=['title', 'year'])
c_costar = c_costar[c_costar['name_y']!='Aaron Abrams']
c_costar.head()

	title	year	name_x	type_x	character_x	n_x	name_y	type_y	character_y	n_y
1	#FromJennifer	2017	Aaron Abrams	actor	Ralph Sinclair	NaN	Christian Ackerman	actor	Simon	NaN
2	388 Arletta Avenue	2011	Aaron Abrams	actor	Alex	4.0	Graham Abbey	actor	Officer #2	8.0
5	Amelia	2009	Aaron Abrams	actor	Slim Gordon	8.0	Jeremy Akerman	actor	Sheriff	19.0
8	Cinderella Man	2005	Aaron Abrams	actor	1928 Fan	67.0	Nick Alachiotis	actor	Baer Cornerman	38.0
9	Cinderella Man	2005	Aaron Abrams	actor	1928 Fan	67.0	Nick Alachiotis	actor	Undercard Boxer - Feldman	38.0

Index

데이터를 구성하고 데이터에 대한 빠른 액세스를 가능하게한다. %%timeit을 사용하여 작업에 필요한 시간을 비교할 수 있다.

import pandas as pd
cast = pd.read_csv('./pythondsp-pandasguide-b936c3b43406/data/cast.csv',
                    index_col=None, encoding='utf-8')
cast.head()

	title	year	name	type	character	n
0	Closet Monster	2015	Buffy #1	actor	Buffy 4	31.0
1	Suuri illusioni	1985	Homo $	actor	Guests	22.0
2	Battle of the Sexes	2017	$hutter	actor	Bobby Riggs Fan	10.0
3	Secret in Their Eyes	2015	$hutter	actor	2002 Dodger Fan	NaN
4	Steve Jobs	2015	$hutter	actor	1988 Opera House Patron	NaN

index 생성

%%time
# indexing없이 데이터 접근

cast[cast['title']=='Macbeth']

CPU times: user 14.3 ms, sys: 1.64 ms, total: 16 ms
Wall time: 16.9 ms

	title	year	name	type	character	n
12868	Macbeth	2015	Darren Adamson	actor	Soldier	NaN
22302	Macbeth	1916	Spottiswoode Aitken	actor	Duncan	4.0
25855	Macbeth	1948	Robert Alan	actor	Third Murderer	NaN
26990	Macbeth	2016	John Albasiny	actor	Doctor	NaN
38090	Macbeth	1948	William Alland	actor	Second Murderer	18.0
40639	Macbeth	1997	Stevie Allen	actor	Murderer	21.0
60543	Macbeth	2014	Moyo Akand?	actress	Witch	NaN
63776	Macbeth	1916	Mary Alden	actress	Lady Macduff	6.0

%%timeit

# indexing 없이 데이터 접근
cast[cast['title']=='Macbeth']

3.72 ms ± 23.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)

set index 를 통해 데이터에 대한 인덱스를 생성한다.

c = cast.set_index(['title'])
c.head(4)

	year	name	type	character	n
title
Closet Monster	2015	Buffy #1	actor	Buffy 4	31.0
Suuri illusioni	1985	Homo $	actor	Guests	22.0
Battle of the Sexes	2017	$hutter	actor	Bobby Riggs Fan	10.0
Secret in Their Eyes	2015	$hutter	actor	2002 Dodger Fan	NaN

더 빠른 작업을 위해 .loc 를 사용

%%time

c.loc['Macbeth']

CPU times: user 16.3 ms, sys: 2.59 ms, total: 18.9 ms
Wall time: 21.5 ms

	year	name	type	character	n
title
Macbeth	2015	Darren Adamson	actor	Soldier	NaN
Macbeth	1916	Spottiswoode Aitken	actor	Duncan	4.0
Macbeth	1948	Robert Alan	actor	Third Murderer	NaN
Macbeth	2016	John Albasiny	actor	Doctor	NaN
Macbeth	1948	William Alland	actor	Second Murderer	18.0
Macbeth	1997	Stevie Allen	actor	Murderer	21.0
Macbeth	2014	Moyo Akand?	actress	Witch	NaN
Macbeth	1916	Mary Alden	actress	Lady Macduff	6.0

%%timeit

c.loc['Macbeth']

1.08 ms ± 2.13 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

더 빠른 작업을 위해 인덱스를 정렬한다.

cs = cast.set_index(['title']).sort_index()
cs.tail(4)

	year	name	type	character	n
title
xXx: Return of Xander Cage	2017	Julie Abcede	actor	Catwalk Partiers	84.0
xXx: Return of Xander Cage	2017	Jeimi Abila	actress	Lazarus' Girls	64.0
xXx: Return of Xander Cage	2017	Wayne Ambrose	actor	Choir Members	34.0
xXx: State of the Union	2005	Robert Alonzo	actor	Guard	NaN

%%time
cs.loc['Macbeth']

CPU times: user 20.3 ms, sys: 804 µs, total: 21.1 ms
Wall time: 22.7 ms

	year	name	type	character	n
title
Macbeth	1948	William Alland	actor	Second Murderer	18.0
Macbeth	1916	Spottiswoode Aitken	actor	Duncan	4.0
Macbeth	1916	Mary Alden	actress	Lady Macduff	6.0
Macbeth	1997	Stevie Allen	actor	Murderer	21.0
Macbeth	1948	Robert Alan	actor	Third Murderer	NaN
Macbeth	2016	John Albasiny	actor	Doctor	NaN
Macbeth	2014	Moyo Akand?	actress	Witch	NaN
Macbeth	2015	Darren Adamson	actor	Soldier	NaN

%%timeit

cs.loc['Macbeth']

70 µs ± 392 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

Multiple index

데이터에 여러 인덱스를 갖게 할 수 있다.

cm = cast.set_index(['title', 'n']).sort_index()
cm.tail(30)

		year	name	type	character
title	n
Zwei in einem Anzug	2.0	1950	Wolf Albach-Retty	actor	Otto Vogel
Zwei in einem Auto	2.0	1951	Wolf Albach-Retty	actor	Georg Schmittlein
Zweimal zwei im Himmelbett	1.0	1937	Georg Alexander	actor	Arnd Krusemark
Zwischen Lachen und Weinen	NaN	1919	Georg Alexander	actor	Hans
Zwischen Pankow und Zehlendorf	3.0	1991	Kathrin Ackermann	actress	Nora Permont
Zwischen Pankow und Zehlendorf	NaN	1991	Eugen Albert	actor	Soldat
Zwischen Welten	2.0	2014	Mohsin Ahmady	actor	Tarik
Zwischen den Sternen	2.0	2002	Fatih Alas	actor	Umut
Zwischen vierzehn und siebzehn - Sexualnot der Jugend	5.0	1929	Fritz Alberti	actor	Parent
Zwischengleis	7.0	1978	Alexander Allerson	actor	Bolle
Zyat iz provintsii	NaN	1987	Dinmukhamet Akhimov	actor	?Taksist?
Zyat iz provintsii	NaN	1987	Kalampyr Ajsangaliyeva	actress	Luzhan
Zzikhimyeon jukneunda	3.0	2000	Jae-hwan Ahn	actor	The Teacher
bestefreunde	3.0	2014	Tina Amon Amonsen	actress	Vivian
bwoy	NaN	2016	Drew (IV) Allen	actor	Boss
eCupid	33.0	2011	Christy Alvarado	actress	Patron
eCupid	47.0	2011	Shon Abram	actor	Patron
el Ziara	NaN	2014	Manel Abdelkoui	actor	Randa
fl 19,99	NaN	1998	Thomas Acda	actor	Harrie Harings
fl 19,99	NaN	1998	Marijke Aerts	actress	Mevrouw van Veen
iBoy	9.0	2017	Shaquille Ali-Yebuah	actor	Cass
inter.m@tes	6.0	2004	Jojo Alejar	actor	Gus 'Hopeless' Geronimo
kingdom of ants	NaN	2016	Zayar Abdulla	actor	Hadi
kingdom of ants	NaN	2016	Rupak Abdulkadr	actress	Rupak mother
les aventures d'archives	NaN	2016	Buzz Aldrin	actor	Himself
w Delta z	8.0	2007	Barbara Adair	actress	Alice Jackson
xXx: Return of Xander Cage	34.0	2017	Wayne Ambrose	actor	Choir Members
	64.0	2017	Jeimi Abila	actress	Lazarus' Girls
	84.0	2017	Julie Abcede	actor	Catwalk Partiers
xXx: State of the Union	NaN	2005	Robert Alonzo	actor	Guard

cm.loc['Macbeth']

	year	name	type	character
n
4.0	1916	Spottiswoode Aitken	actor	Duncan
6.0	1916	Mary Alden	actress	Lady Macduff
18.0	1948	William Alland	actor	Second Murderer
21.0	1997	Stevie Allen	actor	Murderer
NaN	2015	Darren Adamson	actor	Soldier
NaN	1948	Robert Alan	actor	Third Murderer
NaN	2016	John Albasiny	actor	Doctor
NaN	2014	Moyo Akand?	actress	Witch

cm.loc['Macbeth'].loc[4:18]

	year	name	type	character
n
4.0	1916	Spottiswoode Aitken	actor	Duncan
6.0	1916	Mary Alden	actress	Lady Macduff
18.0	1948	William Alland	actor	Second Murderer

cm.loc['Macbeth'].loc[4]

year                        1916
name         Spottiswoode Aitken
type                       actor
character                 Duncan
Name: 4.0, dtype: object

cm.head(2)

		year	name	type	character
title	n
#1 Serial Killer	17.0	2013	Michael Alton	actor	Detective Roberts
#DigitalLivesMatter	NaN	2016	Rashan Ali	actress	News Reporter

reset index

reset_index( ) 을 사용하여 인덱스를 제설정 할 수 있다.

cm = cm.reset_index('n')
cm.head(2)

	n	year	name	type	character
title
#1 Serial Killer	17.0	2013	Michael Alton	actor	Detective Roberts
#DigitalLivesMatter	NaN	2016	Rashan Ali	actress	News Reporter

CSV library

read file

import csv
titles = list(csv.DictReader(open('./pythondsp-pandasguide-b936c3b43406/data/titles.csv')))
titles[0:5]

[{'title': 'The Rising Son', 'year': '1990'},
 {'title': 'The Thousand Plane Raid', 'year': '1969'},
 {'title': 'Crucea de piatra', 'year': '1993'},
 {'title': 'Country', 'year': '2000'},
 {'title': 'Gaiking II', 'year': '2011'}]

titles[-5:]

[{'title': 'Rebel', 'year': '1970'},
 {'title': 'Suzanne', 'year': '1996'},
 {'title': 'Bomba', 'year': '2013'},
 {'title': 'Aao Jao Ghar Tumhara', 'year': '1984'},
 {'title': 'Mrs. Munck', 'year': '1995'}]

별도의 행에 제목과 연도 표시

for k, v in titles[0].items():
    print(k, ':', v)

title : The Rising Son
year : 1990

연도에 따라 영화 표시

1985년의 모든 영화 표시

year85 = [a for a in titles if a['year']=='1985']
year85[:5]

[{'title': 'Insaaf Main Karoonga', 'year': '1985'},
 {'title': 'Vivre pour survivre', 'year': '1985'},
 {'title': 'Water', 'year': '1985'},
 {'title': 'Doea tanda mata', 'year': '1985'},
 {'title': 'Koritsia gia tsibima', 'year': '1985'}]

1990~1999년 영화표시

movies90 = [m for m in titles if (int(m['year'])<int('2000')) and (int(m['year'])>int('1989'))]
movies90[:5]

[{'title': 'The Rising Son', 'year': '1990'},
 {'title': 'Crucea de piatra', 'year': '1993'},
 {'title': 'Poka Makorer Ghar Bosoti', 'year': '1996'},
 {'title': 'Maa Durga Shakti', 'year': '1999'},
 {'title': 'Conflict of Interest', 'year': '1993'}]

모든 ‘Macbeth’ 찾기

macbeth = [m for m in titles if m['title']=='Macbeth']
macbeth[:3]

[{'title': 'Macbeth', 'year': '1913'},
 {'title': 'Macbeth', 'year': '2006'},
 {'title': 'Macbeth', 'year': '2013'}]

itemgetter

연도별로 영화 정렬

from operator import itemgetter
sorted(macbeth, key=itemgetter('year'))[:3]

[{'title': 'Macbeth', 'year': '1913'},
 {'title': 'Macbeth', 'year': '1997'},
 {'title': 'Macbeth', 'year': '1998'}]

공백 0으로 채우기

casts = list(csv.DictReader(open('./pythondsp-pandasguide-b936c3b43406/data/cast.csv')))

casts[3:5]

[{'title': 'Secret in Their Eyes',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '2002 Dodger Fan',
  'n': ''},
 {'title': 'Steve Jobs',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '1988 Opera House Patron',
  'n': ''}]

0으로 대체

cast0 = [{**c, 'n':c['n'].replace('', '0')} for c in casts]
cast0[3:5]

[{'title': 'Secret in Their Eyes',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '2002 Dodger Fan',
  'n': '0'},
 {'title': 'Steve Jobs',
  'year': '2015',
  'name': '$hutter',
  'type': 'actor',
  'character': '1988 Opera House Patron',
  'n': '0'}]

Maa 로 시작하는 영화

maa = [m for m in titles if m['title'].startswith('Maa')]
maa[:3]

[{'title': 'Maa Durga Shakti', 'year': '1999'},
 {'title': 'Maarek hob', 'year': '2004'},
 {'title': 'Maa Aur Mamta', 'year': '1970'}]

collections.counter

연도별로 영화계산

from collections import Counter

by_year = Counter(t['year']for t in titles)
by_year.most_common(3)

[('2016', 2363), ('2017', 2138), ('2015', 1849)]

import matplotlib.pyplot as plt

data = by_year.most_common(len(titles))
data = sorted(data)
x = [c[0] for c in data]
y = [c[1] for c in data]
plt.plot(x, y)
plt.show()

png

collections.defaultdict

연도별로 dictionary에 영화 추가

from collections import defaultdict

d = defaultdict(list)
for row in titles:
    d[row['year']].append(row['title'])
    
xx=[]
yy=[]
for k, v in d.items():
    xx.append(k)
    yy.append(len(v))

plt.plot(sorted(xx), yy)
plt.show()

png

xx[:5]

['1990', '1969', '1993', '2000', '2011']

yy[:5]

[515, 465, 437, 616, 1457]

Aaron Abrams 의 모든 영화 보기

cf = [c for c in casts if c['name']=='Aaron Abrams']
cf[:3]

[{'title': '#FromJennifer',
  'year': '2017',
  'name': 'Aaron Abrams',
  'type': 'actor',
  'character': 'Ralph Sinclair',
  'n': ''},
 {'title': '388 Arletta Avenue',
  'year': '2011',
  'name': 'Aaron Abrams',
  'type': 'actor',
  'character': 'Alex',
  'n': '4'},
 {'title': 'Amelia',
  'year': '2009',
  'name': 'Aaron Abrams',
  'type': 'actor',
  'character': 'Slim Gordon',
  'n': '8'}]

dcf = defaultdict(list)
for row in cf:
    dcf[row['year']].append(row['title'])
    
dcf

defaultdict(list,
            {'2017': ['#FromJennifer', 'The Go-Getters'],
             '2011': ['388 Arletta Avenue',
              'Jesus Henry Christ',
              'Jesus Henry Christ',
              'Take This Waltz',
              'The Chicago 8'],
             '2009': ['Amelia', 'At Home by Myself... with You'],
             '2005': ['Cinderella Man', 'Sabah'],
             '2015': ['Closet Monster', 'Regression'],
             '2018': ['Code 8'],
             '2007': ['Firehouse Dog', 'Young People Fucking'],
             '2008': ['Flash of Genius'],
             '2013': ['It Was You Charlie'],
             '2004': ['Resident Evil: Apocalypse', 'Siblings'],
             '2003': ['The In-Laws', 'The Visual Bible: The Gospel of John'],
             '2006': ['Zoom']})

day 2

Pandas

day 2

Pandas 기초

Unstack

merge

Index

Multiple index

reset index

CSV library

read file

연도에 따라 영화 표시

itemgetter

공백 0으로 채우기

collections.counter

collections.defaultdict