#8-3 :: so

Python/Jupyter notebook 2023. 1. 16. 21:36

데이터 이산화

In [1]:

import pandas as pd

In [7]:

fta = pd.read_excel('./FTA무역통계_20230109000036.xls', header=4, index_col=1)

C:\ProgramData\Anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

FTA 데이터에서 수출건수, 수출금액, 수입건수, 수입금액, 무역수지가
각각 평균 이상인 FTA와 평균 미만인 FTA로 이산화를 해 보자
이산화 데이터는 fta_bin 으로 저장한다

시간이 남는다면, 어떤 데이터를 가져와도 동작하는 표준 코드를 만들어보자

In [41]:

fta['수출건수'].apply(lambda x: int(x.replace(' ','').replace(',',''))).mean() # 띄어쓰기, 콤마 없애기

Out[41]:

884149.9444444445

In [9]:

fta.columns

Out[9]:

Index(['기간', '수출건수', '수출금액', '수입건수', '수입금액', '무역수지'], dtype='object')

In [11]:

target = fta.columns[1:]
fta_bin = pd.DataFrame()
df = fta

for c in target:
    col = df[c].apply(lambda x: int(x.replace(' ','').replace(',','')))
    fta_bin[c] = col > col.mean()

In [19]:

fta_bin # True, False 이진분류

Out[19]:

수출건수수출금액수입건수수입금액무역수지FTA명RCEP한-EFTA한-EU한-뉴질랜드한-미국한-베트남한-싱가포르한-아세안한-영국한-인도한-중국한-중미한-칠레한-캐나다한-콜롬비아한-터키한-페루한-호주

True	True	True	True	False
False	False	False	False	False
False	True	True	True	False
False	False	False	False	False
True	True	True	True	True
False	True	False	False	True
False	False	False	False	True
True	True	False	True	True
False	False	False	False	False
False	False	False	False	True
True	True	True	True	False
False	False	False	False	False
False	False	False	False	False
False	False	False	False	False
False	False	False	False	False
False	False	False	False	True
False	False	False	False	False
False	False	False	False	False

In [20]:

fta_bin.replace([True,False], [1,0])
# DataFrame에 replace 함수 사용해보기. 여러 값 입력할 때는 리스트

Out[20]:

1	1	1	1	0
0	0	0	0	0
0	1	1	1	0
0	0	0	0	0
1	1	1	1	1
0	1	0	0	1
0	0	0	0	1
1	1	0	1	1
0	0	0	0	0
0	0	0	0	1
1	1	1	1	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	1
0	0	0	0	0
0	0	0	0	0

데이터 축소

앞서 배운 내용을 기반으로 데이터 축소를 해 보자
단, 차원축소 제외

In [93]:

df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')

C:\Users\user\AppData\Local\Temp\ipykernel_18480\496141101.py:1: DtypeWarning: Columns (7,19,20) have mixed types. Specify dtype option on import or set low_memory=False.
  df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')

In [97]:

df2[['자치구코드','자치구명']]

Out[97]:

자치구코드자치구명01234...639995639996639997639998639999

11710	송파구
11140	중구
11260	중랑구
11740	강동구
11380	은평구
...	...
11320	도봉구
11290	성북구
11320	도봉구
11305	강북구
11380	은평구

640000 rows × 2 columns

In [67]:

df2['권리구분'].unique()

Out[67]:

array([nan, '입주권', '분양권'], dtype=object)

In [69]:

df2['신고구분'].unique()

Out[69]:

array(['중개거래', '직거래', nan], dtype=object)

In [72]:

df2[['자치구코드','자치구명']].drop_duplicates()
# 해당 데이터를 메타데이터로 저장하면 둘 중 하나를 제거할 수 있음

Out[72]:

자치구코드자치구명0123457810121415172021222527313334486070114

11710	송파구
11140	중구
11260	중랑구
11740	강동구
11380	은평구
11230	동대문구
11545	금천구
11200	성동구
11470	양천구
11410	서대문구
11320	도봉구
11305	강북구
11590	동작구
11620	관악구
11560	영등포구
11530	구로구
11650	서초구
11500	강서구
11215	광진구
11350	노원구
11680	강남구
11290	성북구
11170	용산구
11110	종로구
11440	마포구

In [73]:

df2[['법정동코드','법정동명']].drop_duplicates()

Out[73]:

법정동코드법정동명01234...567294581662599973601008617559

10600	삼전동
16100	저동2가
10300	중화동
10900	천호동
10800	역촌동
...	...
12600	명동1가
16600	서소문동
15900	초동
13000	남산동3가
11000	수표동

418 rows × 2 columns

In [74]:

df2[['지번구분','지번구분명']].drop_duplicates()

Out[74]:

지번구분지번구분명02812734639

1.0	대지
NaN	NaN
3.0	블럭
2.0	산

In [79]:

df2.drop(['접수연도','자치구명','법정동명','지번구분명'], inplace=True, axis='columns')

In [80]:

#df2.to_feather('./result.feather') feather 등의 I/O 부담이 덜한 확장자를 이용할 수 있음

Out[80]:

자치구코드법정동코드지번구분본번부번건물명계약일물건금액(만원)건물면적(㎡)토지면적(㎡)층권리구분취소일건축년도건물용도신고구분신고한 개업공인중개사 시군구명01234...639995639996639997639998639999

11710	10600	1.0	43.0	3.0	더트라이앵글	20230106	27000	16.59	12.15	3.0	NaN	NaN	2021.0	연립다세대	중개거래	서울 송파구
11140	16100	1.0	7.0	2.0	제이매크로 타워	20230106	20300	19.35	31.04	9.0	NaN	NaN	2018.0	오피스텔	중개거래	서울 중구
11260	10300	1.0	295.0	38.0	대일빌라	20230106	13500	44.00	27.58	2.0	NaN	NaN	1994.0	연립다세대	중개거래	서울 중랑구
11740	10900	1.0	425.0	3.0	천호역 한강 푸르지오시티	20230106	18800	25.24	35.91	28.0	NaN	NaN	2015.0	오피스텔	중개거래	서울 강동구
11380	10800	1.0	80.0	2.0	임창에버빌(80-2)	20230106	22500	40.48	15.96	6.0	NaN	NaN	2011.0	연립다세대	중개거래	서울 은평구
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
11320	10500	1.0	56.0	0.0	삼익세라믹	20190604	22000	42.93	NaN	8.0	NaN	NaN	1988.0	아파트	NaN	NaN
11290	13400	1.0	1278.0	0.0	길음동동부센트레빌(1278-0)	20190604	49800	60.00	NaN	8.0	NaN	NaN	2003.0	아파트	NaN	NaN
11320	10600	1.0	271.0	1.0	신동아아파트1	20190604	25000	53.16	NaN	15.0	NaN	NaN	1986.0	아파트	NaN	NaN
11305	10300	1.0	169.0	5.0	드림하이빌(169-5)	20190604	19500	43.88	27.20	5.0	NaN	NaN	2015.0	연립다세대	NaN	NaN
11380	10900	1.0	261.0	20.0	성락타운(261-20)	20190604	20250	49.35	45.72	1.0	NaN	NaN	1987.0	연립다세대	NaN	NaN

640000 rows × 17 columns

소수점 아래 자리수 알아보기

45.12와 45는 각각 실수와 정수라는 차이가 있다
45.12와 45는 .12라는 값의 차이가 있다
45.12와 45는 유효숫자의 자릿수 차이가 있다

소수점 아래 자리수가 있는지 없는지 체크하는 코드 만들어보기

In [98]:

round(45.12) # 반올림하는 함수

Out[98]:

In [99]:

int(45.12)

Out[99]:

In [100]:

type(45.12), type(45)

Out[100]:

(float, int)

In [102]:

str(45.12), str(45)

Out[102]:

('45.12', '45')

In [83]:

df2['물건금액(만원)']/100
# 유효숫자 변동이 없다면, 단위를 조절하는 의미가 크지 않다

Out[83]:

0         270.0
1         203.0
2         135.0
3         188.0
4         225.0
          ...  
639995    220.0
639996    498.0
639997    250.0
639998    195.0
639999    202.5
Name: 물건금액(만원), Length: 640000, dtype: float64

In [91]:

# 단위 변환시 소수점이 생기지 않는게 바람직함. 단, 원본 데이터가 소수점이 있는 경우 예외
price = df2['물건금액(만원)']/100
price_int = price.apply(lambda x: int(x))
(price == price_int).unique()

Out[91]:

array([ True, False])

고정소수점, 부동소수점 연산 시연

In [104]:

float(45.6565) # 부동소수점

Out[104]:

45.6565

In [108]:

float1 = 45.6565

In [110]:

'%.70f' %float1

Out[110]:

'45.6565000000000011937117960769683122634887695312500000000000000000000000'

In [105]:

import decimal # 고정소수점

In [127]:

decimal.Decimal(45) / decimal.Decimal(6)

Out[127]:

Decimal('7.5')

In [121]:

float3 = float(7.55)

In [126]:

'%.70f' %float3

Out[126]:

'7.5499999999999998223643160599749535322189331054687500000000000000000000'

차원축소

In [ ]:

# sklearn이 안깔렸을 때는
# 명령 프롬프트 창에 pip install sklearn

In [21]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [24]:

from sklearn.datasets import load_iris # 붓꽃 데이터

In [25]:

iris = load_iris()

In [28]:

iris

Out[28]:

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5.5, 4.2, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.2],
        [5. , 3.2, 1.2, 0.2],
        [5.5, 3.5, 1.3, 0.2],
        [4.9, 3.6, 1.4, 0.1],
        [4.4, 3. , 1.3, 0.2],
        [5.1, 3.4, 1.5, 0.2],
        [5. , 3.5, 1.3, 0.3],
        [4.5, 2.3, 1.3, 0.3],
        [4.4, 3.2, 1.3, 0.2],
        [5. , 3.5, 1.6, 0.6],
        [5.1, 3.8, 1.9, 0.4],
        [4.8, 3. , 1.4, 0.3],
        [5.1, 3.8, 1.6, 0.2],
        [4.6, 3.2, 1.4, 0.2],
        [5.3, 3.7, 1.5, 0.2],
        [5. , 3.3, 1.4, 0.2],
        [7. , 3.2, 4.7, 1.4],
        [6.4, 3.2, 4.5, 1.5],
        [6.9, 3.1, 4.9, 1.5],
        [5.5, 2.3, 4. , 1.3],
        [6.5, 2.8, 4.6, 1.5],
        [5.7, 2.8, 4.5, 1.3],
        [6.3, 3.3, 4.7, 1.6],
        [4.9, 2.4, 3.3, 1. ],
        [6.6, 2.9, 4.6, 1.3],
        [5.2, 2.7, 3.9, 1.4],
        [5. , 2. , 3.5, 1. ],
        [5.9, 3. , 4.2, 1.5],
        [6. , 2.2, 4. , 1. ],
        [6.1, 2.9, 4.7, 1.4],
        [5.6, 2.9, 3.6, 1.3],
        [6.7, 3.1, 4.4, 1.4],
        [5.6, 3. , 4.5, 1.5],
        [5.8, 2.7, 4.1, 1. ],
        [6.2, 2.2, 4.5, 1.5],
        [5.6, 2.5, 3.9, 1.1],
        [5.9, 3.2, 4.8, 1.8],
        [6.1, 2.8, 4. , 1.3],
        [6.3, 2.5, 4.9, 1.5],
        [6.1, 2.8, 4.7, 1.2],
        [6.4, 2.9, 4.3, 1.3],
        [6.6, 3. , 4.4, 1.4],
        [6.8, 2.8, 4.8, 1.4],
        [6.7, 3. , 5. , 1.7],
        [6. , 2.9, 4.5, 1.5],
        [5.7, 2.6, 3.5, 1. ],
        [5.5, 2.4, 3.8, 1.1],
        [5.5, 2.4, 3.7, 1. ],
        [5.8, 2.7, 3.9, 1.2],
        [6. , 2.7, 5.1, 1.6],
        [5.4, 3. , 4.5, 1.5],
        [6. , 3.4, 4.5, 1.6],
        [6.7, 3.1, 4.7, 1.5],
        [6.3, 2.3, 4.4, 1.3],
        [5.6, 3. , 4.1, 1.3],
        [5.5, 2.5, 4. , 1.3],
        [5.5, 2.6, 4.4, 1.2],
        [6.1, 3. , 4.6, 1.4],
        [5.8, 2.6, 4. , 1.2],
        [5. , 2.3, 3.3, 1. ],
        [5.6, 2.7, 4.2, 1.3],
        [5.7, 3. , 4.2, 1.2],
        [5.7, 2.9, 4.2, 1.3],
        [6.2, 2.9, 4.3, 1.3],
        [5.1, 2.5, 3. , 1.1],
        [5.7, 2.8, 4.1, 1.3],
        [6.3, 3.3, 6. , 2.5],
        [5.8, 2.7, 5.1, 1.9],
        [7.1, 3. , 5.9, 2.1],
        [6.3, 2.9, 5.6, 1.8],
        [6.5, 3. , 5.8, 2.2],
        [7.6, 3. , 6.6, 2.1],
        [4.9, 2.5, 4.5, 1.7],
        [7.3, 2.9, 6.3, 1.8],
        [6.7, 2.5, 5.8, 1.8],
        [7.2, 3.6, 6.1, 2.5],
        [6.5, 3.2, 5.1, 2. ],
        [6.4, 2.7, 5.3, 1.9],
        [6.8, 3. , 5.5, 2.1],
        [5.7, 2.5, 5. , 2. ],
        [5.8, 2.8, 5.1, 2.4],
        [6.4, 3.2, 5.3, 2.3],
        [6.5, 3. , 5.5, 1.8],
        [7.7, 3.8, 6.7, 2.2],
        [7.7, 2.6, 6.9, 2.3],
        [6. , 2.2, 5. , 1.5],
        [6.9, 3.2, 5.7, 2.3],
        [5.6, 2.8, 4.9, 2. ],
        [7.7, 2.8, 6.7, 2. ],
        [6.3, 2.7, 4.9, 1.8],
        [6.7, 3.3, 5.7, 2.1],
        [7.2, 3.2, 6. , 1.8],
        [6.2, 2.8, 4.8, 1.8],
        [6.1, 3. , 4.9, 1.8],
        [6.4, 2.8, 5.6, 2.1],
        [7.2, 3. , 5.8, 1.6],
        [7.4, 2.8, 6.1, 1.9],
        [7.9, 3.8, 6.4, 2. ],
        [6.4, 2.8, 5.6, 2.2],
        [6.3, 2.8, 5.1, 1.5],
        [6.1, 2.6, 5.6, 1.4],
        [7.7, 3. , 6.1, 2.3],
        [6.3, 3.4, 5.6, 2.4],
        [6.4, 3.1, 5.5, 1.8],
        [6. , 3. , 4.8, 1.8],
        [6.9, 3.1, 5.4, 2.1],
        [6.7, 3.1, 5.6, 2.4],
        [6.9, 3.1, 5.1, 2.3],
        [5.8, 2.7, 5.1, 1.9],
        [6.8, 3.2, 5.9, 2.3],
        [6.7, 3.3, 5.7, 2.5],
        [6.7, 3. , 5.2, 2.3],
        [6.3, 2.5, 5. , 1.9],
        [6.5, 3. , 5.2, 2. ],
        [6.2, 3.4, 5.4, 2.3],
        [5.9, 3. , 5.1, 1.8]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'frame': None,
 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 150 (50 in each of three classes)\n    :Number of Attributes: 4 numeric, predictive attributes and the class\n    :Attribute Information:\n        - sepal length in cm\n        - sepal width in cm\n        - petal length in cm\n        - petal width in cm\n        - class:\n                - Iris-Setosa\n                - Iris-Versicolour\n                - Iris-Virginica\n                \n    :Summary Statistics:\n\n    ============== ==== ==== ======= ===== ====================\n                    Min  Max   Mean    SD   Class Correlation\n    ============== ==== ==== ======= ===== ====================\n    sepal length:   4.3  7.9   5.84   0.83    0.7826\n    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n    ============== ==== ==== ======= ===== ====================\n\n    :Missing Attribute Values: None\n    :Class Distribution: 33.3% for each of 3 classes.\n    :Creator: R.A. Fisher\n    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n    :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature.  Fisher\'s paper is a classic in the field and\nis referenced frequently to this day.  (See Duda & Hart, for example.)  The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant.  One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n     Mathematical Statistics" (John Wiley, NY, 1950).\n   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n     Structure and Classification Rule for Recognition in Partially Exposed\n     Environments".  IEEE Transactions on Pattern Analysis and Machine\n     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions\n     on Information Theory, May 1972, 431-433.\n   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II\n     conceptual clustering system finds 3 classes in the data.\n   - Many, many more ...',
 'feature_names': ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 'filename': 'iris.csv',
 'data_module': 'sklearn.datasets.data'}

In [30]:

iris.data

Out[30]:

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.2],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.6, 1.4, 0.1],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1. ],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5. , 2. , 3.5, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [6. , 2.2, 4. , 1. ],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4. , 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3. , 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3. , 5. , 1.7],
       [6. , 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [5.8, 2.7, 3.9, 1.2],
       [6. , 2.7, 5.1, 1.6],
       [5.4, 3. , 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3. , 4.1, 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.6, 4. , 1.2],
       [5. , 2.3, 3.3, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3. , 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6. , 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3. , 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3. , 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2. ],
       [7.7, 2.8, 6.7, 2. ],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6. , 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3. , 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [32]:

print(iris.DESCR) # 메타데이터

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

    ============== ==== ==== ======= ===== ====================
                    Min  Max   Mean    SD   Class Correlation
    ============== ==== ==== ======= ===== ====================
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
    ============== ==== ==== ======= ===== ====================

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fisher's paper. Note that it's the same as in R, but not as in the UCI
Machine Learning Repository, which has two wrong data points.

This is perhaps the best known database to be found in the
pattern recognition literature.  Fisher's paper is a classic in the field and
is referenced frequently to this day.  (See Duda & Hart, for example.)  The
data set contains 3 classes of 50 instances each, where each class refers to a
type of iris plant.  One class is linearly separable from the other 2; the
latter are NOT linearly separable from each other.

.. topic:: References

   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"
     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to
     Mathematical Statistics" (John Wiley, NY, 1950).
   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.
     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.
   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System
     Structure and Classification Rule for Recognition in Partially Exposed
     Environments".  IEEE Transactions on Pattern Analysis and Machine
     Intelligence, Vol. PAMI-2, No. 1, 67-71.
   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions
     on Information Theory, May 1972, 431-433.
   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II
     conceptual clustering system finds 3 classes in the data.
   - Many, many more ...

In [33]:

iris.feature_names

Out[33]:

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [63]:

iris.target

Out[63]:

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [58]:

scaler = StandardScaler()    
pca = PCA(n_components=3)
lda = LinearDiscriminantAnalysis(n_components=2)

In [59]:

scaler.fit(iris.data)
pca.fit(iris.data)

Out[59]:

PCA(n_components=3)

In [60]:

scaled = scaler.transform(iris.data)
reduced = pca.transform(iris.data)

In [64]:

lda.fit(scaled, iris.target)
iris_lda = lda.transform(scaled)

In [129]:

reduced.shape

Out[129]:

(150, 3)

In [131]:

iris_lda.shape

Out[131]:

(150, 2)

In [132]:

iris

Out[132]:

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
        [5.5, 4.2, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.2],
        [5. , 3.2, 1.2, 0.2],
        [5.5, 3.5, 1.3, 0.2],
        [4.9, 3.6, 1.4, 0.1],
        [4.4, 3. , 1.3, 0.2],
        [5.1, 3.4, 1.5, 0.2],
        [5. , 3.5, 1.3, 0.3],
        [4.5, 2.3, 1.3, 0.3],
        [4.4, 3.2, 1.3, 0.2],
        [5. , 3.5, 1.6, 0.6],
        [5.1, 3.8, 1.9, 0.4],
        [4.8, 3. , 1.4, 0.3],
        [5.1, 3.8, 1.6, 0.2],
        [4.6, 3.2, 1.4, 0.2],
        [5.3, 3.7, 1.5, 0.2],
        [5. , 3.3, 1.4, 0.2],
        [7. , 3.2, 4.7, 1.4],
        [6.4, 3.2, 4.5, 1.5],
        [6.9, 3.1, 4.9, 1.5],
        [5.5, 2.3, 4. , 1.3],
        [6.5, 2.8, 4.6, 1.5],
        [5.7, 2.8, 4.5, 1.3],
        [6.3, 3.3, 4.7, 1.6],
        [4.9, 2.4, 3.3, 1. ],
        [6.6, 2.9, 4.6, 1.3],
        [5.2, 2.7, 3.9, 1.4],
        [5. , 2. , 3.5, 1. ],
        [5.9, 3. , 4.2, 1.5],
        [6. , 2.2, 4. , 1. ],
        [6.1, 2.9, 4.7, 1.4],
        [5.6, 2.9, 3.6, 1.3],
        [6.7, 3.1, 4.4, 1.4],
        [5.6, 3. , 4.5, 1.5],
        [5.8, 2.7, 4.1, 1. ],
        [6.2, 2.2, 4.5, 1.5],
        [5.6, 2.5, 3.9, 1.1],
        [5.9, 3.2, 4.8, 1.8],
        [6.1, 2.8, 4. , 1.3],
        [6.3, 2.5, 4.9, 1.5],
        [6.1, 2.8, 4.7, 1.2],
        [6.4, 2.9, 4.3, 1.3],
        [6.6, 3. , 4.4, 1.4],
        [6.8, 2.8, 4.8, 1.4],
        [6.7, 3. , 5. , 1.7],
        [6. , 2.9, 4.5, 1.5],
        [5.7, 2.6, 3.5, 1. ],
        [5.5, 2.4, 3.8, 1.1],
        [5.5, 2.4, 3.7, 1. ],
        [5.8, 2.7, 3.9, 1.2],
        [6. , 2.7, 5.1, 1.6],
        [5.4, 3. , 4.5, 1.5],
        [6. , 3.4, 4.5, 1.6],
        [6.7, 3.1, 4.7, 1.5],
        [6.3, 2.3, 4.4, 1.3],
        [5.6, 3. , 4.1, 1.3],
        [5.5, 2.5, 4. , 1.3],
        [5.5, 2.6, 4.4, 1.2],
        [6.1, 3. , 4.6, 1.4],
        [5.8, 2.6, 4. , 1.2],
        [5. , 2.3, 3.3, 1. ],
        [5.6, 2.7, 4.2, 1.3],
        [5.7, 3. , 4.2, 1.2],
        [5.7, 2.9, 4.2, 1.3],
        [6.2, 2.9, 4.3, 1.3],
        [5.1, 2.5, 3. , 1.1],
        [5.7, 2.8, 4.1, 1.3],
        [6.3, 3.3, 6. , 2.5],
        [5.8, 2.7, 5.1, 1.9],
        [7.1, 3. , 5.9, 2.1],
        [6.3, 2.9, 5.6, 1.8],
        [6.5, 3. , 5.8, 2.2],
        [7.6, 3. , 6.6, 2.1],
        [4.9, 2.5, 4.5, 1.7],
        [7.3, 2.9, 6.3, 1.8],
        [6.7, 2.5, 5.8, 1.8],
        [7.2, 3.6, 6.1, 2.5],
        [6.5, 3.2, 5.1, 2. ],
        [6.4, 2.7, 5.3, 1.9],
        [6.8, 3. , 5.5, 2.1],
        [5.7, 2.5, 5. , 2. ],
        [5.8, 2.8, 5.1, 2.4],
        [6.4, 3.2, 5.3, 2.3],
        [6.5, 3. , 5.5, 1.8],
        [7.7, 3.8, 6.7, 2.2],
        [7.7, 2.6, 6.9, 2.3],
        [6. , 2.2, 5. , 1.5],
        [6.9, 3.2, 5.7, 2.3],
        [5.6, 2.8, 4.9, 2. ],
        [7.7, 2.8, 6.7, 2. ],
        [6.3, 2.7, 4.9, 1.8],
        [6.7, 3.3, 5.7, 2.1],
        [7.2, 3.2, 6. , 1.8],
        [6.2, 2.8, 4.8, 1.8],
        [6.1, 3. , 4.9, 1.8],
        [6.4, 2.8, 5.6, 2.1],
        [7.2, 3. , 5.8, 1.6],
        [7.4, 2.8, 6.1, 1.9],
        [7.9, 3.8, 6.4, 2. ],
        [6.4, 2.8, 5.6, 2.2],
        [6.3, 2.8, 5.1, 1.5],
        [6.1, 2.6, 5.6, 1.4],
        [7.7, 3. , 6.1, 2.3],
        [6.3, 3.4, 5.6, 2.4],
        [6.4, 3.1, 5.5, 1.8],
        [6. , 3. , 4.8, 1.8],
        [6.9, 3.1, 5.4, 2.1],
        [6.7, 3.1, 5.6, 2.4],
        [6.9, 3.1, 5.1, 2.3],
        [5.8, 2.7, 5.1, 1.9],
        [6.8, 3.2, 5.9, 2.3],
        [6.7, 3.3, 5.7, 2.5],
        [6.7, 3. , 5.2, 2.3],
        [6.3, 2.5, 5. , 1.9],
        [6.5, 3. , 5.2, 2. ],
        [6.2, 3.4, 5.4, 2.3],
        [5.9, 3. , 5.1, 1.8]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
 'frame': None,
 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'),
 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 150 (50 in each of three classes)\n    :Number of Attributes: 4 numeric, predictive attributes and the class\n    :Attribute Information:\n        - sepal length in cm\n        - sepal width in cm\n        - petal length in cm\n        - petal width in cm\n        - class:\n                - Iris-Setosa\n                - Iris-Versicolour\n                - Iris-Virginica\n                \n    :Summary Statistics:\n\n    ============== ==== ==== ======= ===== ====================\n                    Min  Max   Mean    SD   Class Correlation\n    ============== ==== ==== ======= ===== ====================\n    sepal length:   4.3  7.9   5.84   0.83    0.7826\n    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n    ============== ==== ==== ======= ===== ====================\n\n    :Missing Attribute Values: None\n    :Class Distribution: 33.3% for each of 3 classes.\n    :Creator: R.A. Fisher\n    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n    :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature.  Fisher\'s paper is a classic in the field and\nis referenced frequently to this day.  (See Duda & Hart, for example.)  The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant.  One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n     Mathematical Statistics" (John Wiley, NY, 1950).\n   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n     Structure and Classification Rule for Recognition in Partially Exposed\n     Environments".  IEEE Transactions on Pattern Analysis and Machine\n     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions\n     on Information Theory, May 1972, 431-433.\n   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II\n     conceptual clustering system finds 3 classes in the data.\n   - Many, many more ...',
 'feature_names': ['sepal length (cm)',
  'sepal width (cm)',
  'petal length (cm)',
  'petal width (cm)'],
 'filename': 'iris.csv',
 'data_module': 'sklearn.datasets.data'}

In [133]:

reduced

Out[133]:

array([[-2.68412563,  0.31939725, -0.02791483],
       [-2.71414169, -0.17700123, -0.21046427],
       [-2.88899057, -0.14494943,  0.01790026],
       [-2.74534286, -0.31829898,  0.03155937],
       [-2.72871654,  0.32675451,  0.09007924],
       [-2.28085963,  0.74133045,  0.16867766],
       [-2.82053775, -0.08946138,  0.25789216],
       [-2.62614497,  0.16338496, -0.02187932],
       [-2.88638273, -0.57831175,  0.02075957],
       [-2.6727558 , -0.11377425, -0.19763272],
       [-2.50694709,  0.6450689 , -0.07531801],
       [-2.61275523,  0.01472994,  0.10215026],
       [-2.78610927, -0.235112  , -0.20684443],
       [-3.22380374, -0.51139459,  0.06129967],
       [-2.64475039,  1.17876464, -0.15162752],
       [-2.38603903,  1.33806233,  0.2777769 ],
       [-2.62352788,  0.81067951,  0.13818323],
       [-2.64829671,  0.31184914,  0.02666832],
       [-2.19982032,  0.87283904, -0.12030552],
       [-2.5879864 ,  0.51356031,  0.21366517],
       [-2.31025622,  0.39134594, -0.23944404],
       [-2.54370523,  0.43299606,  0.20845723],
       [-3.21593942,  0.13346807,  0.29239675],
       [-2.30273318,  0.09870885,  0.03912326],
       [-2.35575405, -0.03728186,  0.12502108],
       [-2.50666891, -0.14601688, -0.25342004],
       [-2.46882007,  0.13095149,  0.09491058],
       [-2.56231991,  0.36771886, -0.07849421],
       [-2.63953472,  0.31203998, -0.1459089 ],
       [-2.63198939, -0.19696122,  0.04077108],
       [-2.58739848, -0.20431849, -0.07722299],
       [-2.4099325 ,  0.41092426, -0.14552497],
       [-2.64886233,  0.81336382,  0.22566915],
       [-2.59873675,  1.09314576,  0.15781081],
       [-2.63692688, -0.12132235, -0.14304958],
       [-2.86624165,  0.06936447, -0.16433231],
       [-2.62523805,  0.59937002, -0.26835038],
       [-2.80068412,  0.26864374,  0.09369908],
       [-2.98050204, -0.48795834,  0.07292705],
       [-2.59000631,  0.22904384, -0.0800823 ],
       [-2.77010243,  0.26352753,  0.07724769],
       [-2.84936871, -0.94096057, -0.34923038],
       [-2.99740655, -0.34192606,  0.19250921],
       [-2.40561449,  0.18887143,  0.26386795],
       [-2.20948924,  0.43666314,  0.29874275],
       [-2.71445143, -0.2502082 , -0.09767814],
       [-2.53814826,  0.50377114,  0.16670564],
       [-2.83946217, -0.22794557,  0.08372685],
       [-2.54308575,  0.57941002, -0.01711502],
       [-2.70335978,  0.10770608, -0.08929401],
       [ 1.28482569,  0.68516047, -0.40656803],
       [ 0.93248853,  0.31833364, -0.01801419],
       [ 1.46430232,  0.50426282, -0.33832576],
       [ 0.18331772, -0.82795901, -0.17959139],
       [ 1.08810326,  0.07459068, -0.3077579 ],
       [ 0.64166908, -0.41824687,  0.04107609],
       [ 1.09506066,  0.28346827,  0.16981024],
       [-0.74912267, -1.00489096,  0.01230292],
       [ 1.04413183,  0.2283619 , -0.41533608],
       [-0.0087454 , -0.72308191,  0.28114143],
       [-0.50784088, -1.26597119, -0.26981718],
       [ 0.51169856, -0.10398124,  0.13054775],
       [ 0.26497651, -0.55003646, -0.69414683],
       [ 0.98493451, -0.12481785, -0.06211441],
       [-0.17392537, -0.25485421,  0.09045769],
       [ 0.92786078,  0.46717949, -0.31462098],
       [ 0.66028376, -0.35296967,  0.32802753],
       [ 0.23610499, -0.33361077, -0.27116184],
       [ 0.94473373, -0.54314555, -0.49951905],
       [ 0.04522698, -0.58383438, -0.2350021 ],
       [ 1.11628318, -0.08461685,  0.45962099],
       [ 0.35788842, -0.06892503, -0.22985389],
       [ 1.29818388, -0.32778731, -0.34785435],
       [ 0.92172892, -0.18273779, -0.23107178],
       [ 0.71485333,  0.14905594, -0.32180094],
       [ 0.90017437,  0.32850447, -0.31620907],
       [ 1.33202444,  0.24444088, -0.52170278],
       [ 1.55780216,  0.26749545, -0.16492098],
       [ 0.81329065, -0.1633503 ,  0.0354245 ],
       [-0.30558378, -0.36826219, -0.31849158],
       [-0.06812649, -0.70517213, -0.24421381],
       [-0.18962247, -0.68028676, -0.30642056],
       [ 0.13642871, -0.31403244, -0.17724277],
       [ 1.38002644, -0.42095429,  0.01616713],
       [ 0.58800644, -0.48428742,  0.4444335 ],
       [ 0.80685831,  0.19418231,  0.38896306],
       [ 1.22069088,  0.40761959, -0.23716701],
       [ 0.81509524, -0.37203706, -0.61472084],
       [ 0.24595768, -0.2685244 ,  0.18836681],
       [ 0.16641322, -0.68192672, -0.06000923],
       [ 0.46480029, -0.67071154, -0.02430686],
       [ 0.8908152 , -0.03446444, -0.00994693],
       [ 0.23054802, -0.40438585, -0.22941024],
       [-0.70453176, -1.01224823, -0.10569115],
       [ 0.35698149, -0.50491009,  0.01661717],
       [ 0.33193448, -0.21265468,  0.08320429],
       [ 0.37621565, -0.29321893,  0.07799635],
       [ 0.64257601,  0.01773819, -0.20539497],
       [-0.90646986, -0.75609337, -0.01259965],
       [ 0.29900084, -0.34889781,  0.01058166],
       [ 2.53119273, -0.00984911,  0.76016543],
       [ 1.41523588, -0.57491635,  0.29632253],
       [ 2.61667602,  0.34390315, -0.11078788],
       [ 1.97153105, -0.1797279 ,  0.10842466],
       [ 2.35000592, -0.04026095,  0.28538956],
       [ 3.39703874,  0.55083667, -0.34843756],
       [ 0.52123224, -1.19275873,  0.5456593 ],
       [ 2.93258707,  0.3555    , -0.42023994],
       [ 2.32122882, -0.2438315 , -0.34830439],
       [ 2.91675097,  0.78279195,  0.42333542],
       [ 1.66177415,  0.24222841,  0.24244019],
       [ 1.80340195, -0.21563762, -0.03764817],
       [ 2.1655918 ,  0.21627559,  0.03332664],
       [ 1.34616358, -0.77681835,  0.28190288],
       [ 1.58592822, -0.53964071,  0.62902933],
       [ 1.90445637,  0.11925069,  0.47963982],
       [ 1.94968906,  0.04194326,  0.04418617],
       [ 3.48705536,  1.17573933,  0.13389487],
       [ 3.79564542,  0.25732297, -0.51376776],
       [ 1.30079171, -0.76114964, -0.34499504],
       [ 2.42781791,  0.37819601,  0.21911932],
       [ 1.19900111, -0.60609153,  0.51185551],
       [ 3.49992004,  0.4606741 , -0.57318224],
       [ 1.38876613, -0.20439933, -0.06452276],
       [ 2.2754305 ,  0.33499061,  0.28615009],
       [ 2.61409047,  0.56090136, -0.20553452],
       [ 1.25850816, -0.17970479,  0.0458477 ],
       [ 1.29113206, -0.11666865,  0.23125646],
       [ 2.12360872, -0.20972948,  0.15418002],
       [ 2.38800302,  0.4646398 , -0.44953019],
       [ 2.84167278,  0.37526917, -0.49889808],
       [ 3.23067366,  1.37416509, -0.11454821],
       [ 2.15943764, -0.21727758,  0.20876317],
       [ 1.44416124, -0.14341341, -0.15323389],
       [ 1.78129481, -0.49990168, -0.17287519],
       [ 3.07649993,  0.68808568, -0.33559229],
       [ 2.14424331,  0.1400642 ,  0.73487894],
       [ 1.90509815,  0.04930053,  0.16218024],
       [ 1.16932634, -0.16499026,  0.28183584],
       [ 2.10761114,  0.37228787,  0.02729113],
       [ 2.31415471,  0.18365128,  0.32269375],
       [ 1.9222678 ,  0.40920347,  0.1135866 ],
       [ 1.41523588, -0.57491635,  0.29632253],
       [ 2.56301338,  0.2778626 ,  0.29256952],
       [ 2.41874618,  0.3047982 ,  0.50448266],
       [ 1.94410979,  0.1875323 ,  0.17782509],
       [ 1.52716661, -0.37531698, -0.12189817],
       [ 1.76434572,  0.07885885,  0.13048163],
       [ 1.90094161,  0.11662796,  0.72325156],
       [ 1.39018886, -0.28266094,  0.36290965]])

In [134]:

iris_lda

Out[134]:

array([[ 8.06179978e+00,  3.00420621e-01],
       [ 7.12868772e+00, -7.86660426e-01],
       [ 7.48982797e+00, -2.65384488e-01],
       [ 6.81320057e+00, -6.70631068e-01],
       [ 8.13230933e+00,  5.14462530e-01],
       [ 7.70194674e+00,  1.46172097e+00],
       [ 7.21261762e+00,  3.55836209e-01],
       [ 7.60529355e+00, -1.16338380e-02],
       [ 6.56055159e+00, -1.01516362e+00],
       [ 7.34305989e+00, -9.47319209e-01],
       [ 8.39738652e+00,  6.47363392e-01],
       [ 7.21929685e+00, -1.09646389e-01],
       [ 7.32679599e+00, -1.07298943e+00],
       [ 7.57247066e+00, -8.05464137e-01],
       [ 9.84984300e+00,  1.58593698e+00],
       [ 9.15823890e+00,  2.73759647e+00],
       [ 8.58243141e+00,  1.83448945e+00],
       [ 7.78075375e+00,  5.84339407e-01],
       [ 8.07835876e+00,  9.68580703e-01],
       [ 8.02097451e+00,  1.14050366e+00],
       [ 7.49680227e+00, -1.88377220e-01],
       [ 7.58648117e+00,  1.20797032e+00],
       [ 8.68104293e+00,  8.77590154e-01],
       [ 6.25140358e+00,  4.39696367e-01],
       [ 6.55893336e+00, -3.89222752e-01],
       [ 6.77138315e+00, -9.70634453e-01],
       [ 6.82308032e+00,  4.63011612e-01],
       [ 7.92461638e+00,  2.09638715e-01],
       [ 7.99129024e+00,  8.63787128e-02],
       [ 6.82946447e+00, -5.44960851e-01],
       [ 6.75895493e+00, -7.59002759e-01],
       [ 7.37495254e+00,  5.65844592e-01],
       [ 9.12634625e+00,  1.22443267e+00],
       [ 9.46768199e+00,  1.82522635e+00],
       [ 7.06201386e+00, -6.63400423e-01],
       [ 7.95876243e+00, -1.64961722e-01],
       [ 8.61367201e+00,  4.03253602e-01],
       [ 8.33041759e+00,  2.28133530e-01],
       [ 6.93412007e+00, -7.05519379e-01],
       [ 7.68823131e+00, -9.22362309e-03],
       [ 7.91793715e+00,  6.75121313e-01],
       [ 5.66188065e+00, -1.93435524e+00],
       [ 7.24101468e+00, -2.72615132e-01],
       [ 6.41443556e+00,  1.24730131e+00],
       [ 6.85944381e+00,  1.05165396e+00],
       [ 6.76470393e+00, -5.05151855e-01],
       [ 8.08189937e+00,  7.63392750e-01],
       [ 7.18676904e+00, -3.60986823e-01],
       [ 8.31444876e+00,  6.44953177e-01],
       [ 7.67196741e+00, -1.34893840e-01],
       [-1.45927545e+00,  2.85437643e-02],
       [-1.79770574e+00,  4.84385502e-01],
       [-2.41694888e+00, -9.27840307e-02],
       [-2.26247349e+00, -1.58725251e+00],
       [-2.54867836e+00, -4.72204898e-01],
       [-2.42996725e+00, -9.66132066e-01],
       [-2.44848456e+00,  7.95961954e-01],
       [-2.22666513e-01, -1.58467318e+00],
       [-1.75020123e+00, -8.21180130e-01],
       [-1.95842242e+00, -3.51563753e-01],
       [-1.19376031e+00, -2.63445570e+00],
       [-1.85892567e+00,  3.19006544e-01],
       [-1.15809388e+00, -2.64340991e+00],
       [-2.66605725e+00, -6.42504540e-01],
       [-3.78367218e-01,  8.66389312e-02],
       [-1.20117255e+00,  8.44373592e-02],
       [-2.76810246e+00,  3.21995363e-02],
       [-7.76854039e-01, -1.65916185e+00],
       [-3.49805433e+00, -1.68495616e+00],
       [-1.09042788e+00, -1.62658350e+00],
       [-3.71589615e+00,  1.04451442e+00],
       [-9.97610366e-01, -4.90530602e-01],
       [-3.83525931e+00, -1.40595806e+00],
       [-2.25741249e+00, -1.42679423e+00],
       [-1.25571326e+00, -5.46424197e-01],
       [-1.43755762e+00, -1.34424979e-01],
       [-2.45906137e+00, -9.35277280e-01],
       [-3.51848495e+00,  1.60588866e-01],
       [-2.58979871e+00, -1.74611728e-01],
       [ 3.07487884e-01, -1.31887146e+00],
       [-1.10669179e+00, -1.75225371e+00],
       [-6.05524589e-01, -1.94298038e+00],
       [-8.98703769e-01, -9.04940034e-01],
       [-4.49846635e+00, -8.82749915e-01],
       [-2.93397799e+00,  2.73791065e-02],
       [-2.10360821e+00,  1.19156767e+00],
       [-2.14258208e+00,  8.87797815e-02],
       [-2.47945603e+00, -1.94073927e+00],
       [-1.32552574e+00, -1.62869550e-01],
       [-1.95557887e+00, -1.15434826e+00],
       [-2.40157020e+00, -1.59458341e+00],
       [-2.29248878e+00, -3.32860296e-01],
       [-1.27227224e+00, -1.21458428e+00],
       [-2.93176055e-01, -1.79871509e+00],
       [-2.00598883e+00, -9.05418042e-01],
       [-1.18166311e+00, -5.37570242e-01],
       [-1.61615645e+00, -4.70103580e-01],
       [-1.42158879e+00, -5.51244626e-01],
       [ 4.75973788e-01, -7.99905482e-01],
       [-1.54948259e+00, -5.93363582e-01],
       [-7.83947399e+00,  2.13973345e+00],
       [-5.50747997e+00, -3.58139892e-02],
       [-6.29200850e+00,  4.67175777e-01],
       [-5.60545633e+00, -3.40738058e-01],
       [-6.85055995e+00,  8.29825394e-01],
       [-7.41816784e+00, -1.73117995e-01],
       [-4.67799541e+00, -4.99095015e-01],
       [-6.31692685e+00, -9.68980756e-01],
       [-6.32773684e+00, -1.38328993e+00],
       [-6.85281335e+00,  2.71758963e+00],
       [-4.44072512e+00,  1.34723692e+00],
       [-5.45009572e+00, -2.07736942e-01],
       [-5.66033713e+00,  8.32713617e-01],
       [-5.95823722e+00, -9.40175447e-02],
       [-6.75926282e+00,  1.60023206e+00],
       [-5.80704331e+00,  2.01019882e+00],
       [-5.06601233e+00, -2.62733839e-02],
       [-6.60881882e+00,  1.75163587e+00],
       [-9.17147486e+00, -7.48255067e-01],
       [-4.76453569e+00, -2.15573720e+00],
       [-6.27283915e+00,  1.64948141e+00],
       [-5.36071189e+00,  6.46120732e-01],
       [-7.58119982e+00, -9.80722934e-01],
       [-4.37150279e+00, -1.21297458e-01],
       [-5.72317531e+00,  1.29327553e+00],
       [-5.27915920e+00, -4.24582377e-02],
       [-4.08087208e+00,  1.85936572e-01],
       [-4.07703640e+00,  5.23238483e-01],
       [-6.51910397e+00,  2.96976389e-01],
       [-4.58371942e+00, -8.56815813e-01],
       [-6.22824009e+00, -7.12719638e-01],
       [-5.22048773e+00,  1.46819509e+00],
       [-6.80015000e+00,  5.80895175e-01],
       [-3.81515972e+00, -9.42985932e-01],
       [-5.10748966e+00, -2.13059000e+00],
       [-6.79671631e+00,  8.63090395e-01],
       [-6.52449599e+00,  2.44503527e+00],
       [-4.99550279e+00,  1.87768525e-01],
       [-3.93985300e+00,  6.14020389e-01],
       [-5.20383090e+00,  1.14476808e+00],
       [-6.65308685e+00,  1.80531976e+00],
       [-5.10555946e+00,  1.99218201e+00],
       [-5.50747997e+00, -3.58139892e-02],
       [-6.79601924e+00,  1.46068695e+00],
       [-6.84735943e+00,  2.42895067e+00],
       [-5.64500346e+00,  1.67771734e+00],
       [-5.17956460e+00, -3.63475041e-01],
       [-4.96774090e+00,  8.21140550e-01],
       [-5.88614539e+00,  2.34509051e+00],
       [-4.68315426e+00,  3.32033811e-01]])

In [ ]:

# scaling: 통계적인 이점 = 같은 스케일로 값을 비교할 수 있다는 이점
# 그리고, ML 모델에 적용할 수 있는 이점이 있다

In [135]:

from sklearn.preprocessing import MinMaxScaler

In [137]:

data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

In [146]:

df = pd.DataFrame(data)
df

Out[146]:

010123

-1.0	2
-0.5	6
0.0	10
1.0	18

In [144]:

pd.DataFrame(scaler.transform(data))

Out[144]:

010123

0.00	0.00
0.25	0.25
0.50	0.50
1.00	1.00

In [138]:

scaler = MinMaxScaler()

In [139]:

scaler.fit(data)

Out[139]:

MinMaxScaler()

In [140]:

scaler.transform(data)
# 각각의 scale이 어떤 기준으로 할당되는가? -> DataFrame 기준, 각 column 단위로 scaling
# 왜 [1,18]이 [1,1]로 저장되는가? -> scaling 기준이 서로 다르기 때문

Out[140]:

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [142]:

scaler.transform([[4,5]])

Out[142]:

array([[2.5   , 0.1875]])

In [3]:

import pandas as pd
from scipy import stats

In [82]:

df = pd.read_excel('./FTA무역통계_20230109000036.xls', header=4, index_col=1)

C:\ProgramData\Anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [5]:

df

Out[5]:

기간수출건수수출금액수입건수수입금액무역수지FTA명RCEP한-EFTA한-EU한-뉴질랜드한-미국한-베트남한-싱가포르한-아세안한-영국한-인도한-중국한-중미한-칠레한-캐나다한-콜롬비아한-터키한-페루한-호주

2022	6,474,383	287,564,087	11,944,545	292,113,146	-4,549,059
2022	26,912	1,794,418	75,489	4,810,598	-3,016,180
2022	882,224	67,684,624	4,734,145	66,716,800	967,825
2022	36,305	3,139,637	455,563	1,743,951	1,395,687
2022	2,186,357	100,446,018	18,531,680	74,972,585	25,473,433
2022	582,908	56,212,924	860,372	24,850,108	31,362,815
2022	749,118	18,696,289	151,934	9,315,582	9,380,706
2022	2,775,570	115,599,623	1,839,987	75,744,022	39,855,602
2022	108,368	5,862,160	386,507	5,374,130	488,030
2022	193,395	17,391,372	182,655	8,403,685	8,987,687
2022	1,325,620	144,611,224	6,775,116	142,753,918	1,857,306
2022	28,606	1,388,730	10,003	933,363	455,367
2022	30,137	1,425,386	16,553	6,264,483	-4,839,097
2022	212,405	7,224,014	329,449	7,966,998	-742,985
2022	14,045	862,924	11,286	636,784	226,140
2022	82,688	7,059,423	46,380	1,287,251	5,772,172
2022	13,929	721,030	10,024	2,734,129	-2,013,099
2022	191,729	17,396,124	560,089	41,235,660	-23,839,536

In [6]:

df['수출건수'][0]

Out[6]:

'           6,474,383'

In [83]:

df['수출건수'] = [int(i.replace(' ','').replace(',','')) for i in df['수출건수']]
df['수입건수'] = [int(i.replace(' ','').replace(',','')) for i in df['수입건수']]

In [8]:

df

Out[8]:

2022	6474383	287,564,087	11944545	292,113,146	-4,549,059
2022	26912	1,794,418	75489	4,810,598	-3,016,180
2022	882224	67,684,624	4734145	66,716,800	967,825
2022	36305	3,139,637	455563	1,743,951	1,395,687
2022	2186357	100,446,018	18531680	74,972,585	25,473,433
2022	582908	56,212,924	860372	24,850,108	31,362,815
2022	749118	18,696,289	151934	9,315,582	9,380,706
2022	2775570	115,599,623	1839987	75,744,022	39,855,602
2022	108368	5,862,160	386507	5,374,130	488,030
2022	193395	17,391,372	182655	8,403,685	8,987,687
2022	1325620	144,611,224	6775116	142,753,918	1,857,306
2022	28606	1,388,730	10003	933,363	455,367
2022	30137	1,425,386	16553	6,264,483	-4,839,097
2022	212405	7,224,014	329449	7,966,998	-742,985
2022	14045	862,924	11286	636,784	226,140
2022	82688	7,059,423	46380	1,287,251	5,772,172
2022	13929	721,030	10024	2,734,129	-2,013,099
2022	191729	17,396,124	560089	41,235,660	-23,839,536

In [9]:

df['수출건수'], df['수입건수']

Out[9]:

(FTA명
 RCEP      6474383
 한-EFTA      26912
 한-EU       882224
 한-뉴질랜드      36305
 한-미국      2186357
 한-베트남      582908
 한-싱가포르     749118
 한-아세안     2775570
 한-영국       108368
 한-인도       193395
 한-중국      1325620
 한-중미        28606
 한-칠레        30137
 한-캐나다      212405
 한-콜롬비아      14045
 한-터키        82688
 한-페루        13929
 한-호주       191729
 Name: 수출건수, dtype: int64,
 FTA명
 RCEP      11944545
 한-EFTA       75489
 한-EU       4734145
 한-뉴질랜드      455563
 한-미국      18531680
 한-베트남       860372
 한-싱가포르      151934
 한-아세안      1839987
 한-영국        386507
 한-인도        182655
 한-중국       6775116
 한-중미         10003
 한-칠레         16553
 한-캐나다       329449
 한-콜롬비아       11286
 한-터키         46380
 한-페루         10024
 한-호주        560089
 Name: 수입건수, dtype: int64)

In [10]:

# ttest_ind = T-test = Student's T-test
stats.ttest_ind(df['수출건수'],df['수입건수'])

Out[10]:

Ttest_indResult(statistic=-1.3740039439389153, pvalue=0.17843151065870033)

0113 추가 실습

위 T-test를 다시 실행한다
단, 데이터 전처리와 transformation을 시도해 본다

In [31]:

import math
import numpy as np
import seaborn as sns

In [37]:

sns.distplot(df['수출건수']) # 분포 확인

Out[37]:

<AxesSubplot:xlabel='수출건수', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 52636 (\N{HANGUL SYLLABLE CUL}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [ ]:

np.exp(df['수출건수']) # 지수 변환

In [ ]:

np.log2(df['수출건수']) # 로그 변환

In [30]:

df['수출건수'].apply(lambda x: math.log2(x))

Out[30]:

FTA명
RCEP      4.499929
한-EFTA    3.879310
한-EU      4.303838
한-뉴질랜드    3.921044
한-미국      4.396440
한-베트남     4.259492
한-싱가포르    4.286499
한-아세안     4.419832
한-영국      4.063984
한-인도      4.134319
한-중국      4.346123
한-중미      3.887918
한-칠레      3.895230
한-캐나다     4.145389
한-콜롬비아    3.784270
한-터키      4.029929
한-페루      3.783017
한-호주      4.133293
Name: 수출건수, dtype: float64

In [ ]:

In [11]:

stats.ttest_ind(df['수출건수'],df['수입건수'])

Out[11]:

Ttest_indResult(statistic=-1.3740039439389153, pvalue=0.17843151065870033)

In [12]:

stats.shapiro(df['수출건수'])

Out[12]:

ShapiroResult(statistic=0.6007808446884155, pvalue=6.888485586387105e-06)

In [13]:

stats.shapiro(df['수입건수'])

Out[13]:

ShapiroResult(statistic=0.5887835025787354, pvalue=5.249044534139102e-06)

In [20]:

import seaborn as sns
import math
import numpy as np

In [49]:

sns.distplot(df['수입건수'].apply(lambda x: math.log2(x)))

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[49]:

<AxesSubplot:xlabel='수입건수', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 51077 (\N{HANGUL SYLLABLE IB}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [21]:

sns.distplot(np.log2(df['수입건수']))

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[21]:

<AxesSubplot:xlabel='수입건수', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 51077 (\N{HANGUL SYLLABLE IB}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [50]:

stats.shapiro(df['수출건수'].apply(lambda x: math.log2(x)))

Out[50]:

ShapiroResult(statistic=0.944760262966156, pvalue=0.3488025963306427)

In [51]:

stats.shapiro(np.log2(df['수입건수']))

Out[51]:

ShapiroResult(statistic=0.9464555978775024, pvalue=0.37215086817741394)

In [52]:

# log2 transformation 기능을 수행하는 함수
df['수출건수'] = df['수출건수'].apply(lambda x: math.log2(x))
df['수입건수'] = np.log2(df['수입건수'])

In [53]:

# KS test: 두 그룹이 같은 분산성을 가지는 모집단에서 추출되었는지 확인
# 귀무가설: 같은 분산성을 가지는 모집단에서 추출되었다
# 대립가설: 같은 분산성을 가지는 모집단에서 추출되지 않았다
stats.ks_2samp(df['수출건수'], df['수입건수'])

Out[53]:

KstestResult(statistic=0.16666666666666666, pvalue=0.9715397823325015)

In [55]:

# levene test: 두 그룹이 등분산성인지 확인
# 귀무가설: 두 그룹이 등분산성이다
# 대립가설: 두 그룹이 등분산성이 아니다 = 이분산성이다
stats.levene(df['수출건수'], df['수입건수'])

Out[55]:

LeveneResult(statistic=0.7822644136954173, pvalue=0.3826615588571587)

In [56]:

# T-test (등분산성)
# Student’s T-test의 귀무가설: 두 그룹은 같은 모평균을 가진다
stats.ttest_ind(df['수출건수'], df['수입건수'], equal_var=True)

Out[56]:

Ttest_indResult(statistic=-0.6030815594484242, pvalue=0.5504565242174382)

In [84]:

# 정규성을 만족하지 않을 때, 정규성을 가지도록 변환할 것인가 비모수적인 방법을 이용할 것인가
# 변환의 정도가 크지 않다면 전자, 크다면 후자를 이용한다
# 변환의 정도는 주관적으로 판단한다
stats.mannwhitneyu(df['수출건수'], df['수입건수'])

Out[84]:

MannwhitneyuResult(statistic=151.0, pvalue=0.7397342431142654)

T-test 절차

전처리
정규성 검정 (Shapiro-Wilk test, p < 0.05일 경우 정규성이 없는 것으로 볼 수 있다)
정규성이 없을 경우 -> transformation을 시도한다 (해당 값을 X로 가정한다)
X에 대해 등분산성 검정 (Levene test, p < 0.05일 경우 이분산성으로 볼 수 있다)
X에 대해 적절한 T-test 기법 선택 후 시행 (p < 0.05일 경우 두 그룹의 모평균이 다르다고 볼 수 있다

scale이 지나치게 크거나 작을 경우, 조절해 주는 것을 권장

실습

아래 데이터를 이용해 통계분석을 해 본다
물건금액 column을 이용
특정 구역과 다른 구역 사이의 관계 또는 특성을 알아본다
통계분석에 사용되는 데이터는 모두 전처리를 한다
샘플 추출이 필요한 경우, 무작위로 추출한다

In [89]:

import random

In [ ]:

# 랜덤 샘플링을 위한 기능
# np.random.randint()
# random 패키지

In [85]:

df = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')

C:\Users\user\AppData\Local\Temp\ipykernel_13608\1027994529.py:1: DtypeWarning: Columns (7,19,20) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')

In [88]:

df.describe()['물건금액(만원)']

Out[88]:

count    6.400000e+05
mean     6.121905e+04
std      8.540087e+04
min      1.700000e+03
25%      2.300000e+04
50%      3.650000e+04
75%      7.300000e+04
max      1.108778e+07
Name: 물건금액(만원), dtype: float64

In [90]:

sns.distplot(df['물건금액(만원)'])

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[90]:

<AxesSubplot:xlabel='물건금액(만원)', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47932 (\N{HANGUL SYLLABLE MUL}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47564 (\N{HANGUL SYLLABLE MAN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50896 (\N{HANGUL SYLLABLE WEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [91]:

sns.distplot(np.log2(df['물건금액(만원)']))

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[91]:

<AxesSubplot:xlabel='물건금액(만원)', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47932 (\N{HANGUL SYLLABLE MUL}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47564 (\N{HANGUL SYLLABLE MAN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50896 (\N{HANGUL SYLLABLE WEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [92]:

df['물건금액(만원)'].sort_values()

Out[92]:

469341        1700
537486        2000
450513        2000
452996        2000
586391        2000
            ...   
213608    11087780
191607    11087780
213454    11087780
191588    11087780
191967    11087780
Name: 물건금액(만원), Length: 640000, dtype: int64

In [94]:

df['물건금액(만원)'].sort_values()[-10:]

Out[94]:

224778     3000000
226055     3000000
213821    11087780
191319    11087780
213661    11087780
213608    11087780
191607    11087780
213454    11087780
191588    11087780
191967    11087780
Name: 물건금액(만원), dtype: int64

In [95]:

stats.shapiro(np.log2(df['물건금액(만원)']))

C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_morestats.py:1800: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Out[95]:

ShapiroResult(statistic=0.9840511679649353, pvalue=0.0)

In [140]:

# sample 개수가 많아 정규성을 만족한다고 가정할지
# log2 transformation을 하는 것으로 결정할지 자의적으로 판단
df['물건금액(만원)'] = np.log2(df['물건금액(만원)'])

In [141]:

df.shape

Out[141]:

(640000, 21)

In [123]:

data1 = pd.DataFrame()
data2 = pd.DataFrame()

In [124]:

for i in range(200):
    r = np.random.randint(0,df.shape[0])
    r2 = np.random.randint(0,df.shape[0])
    data1[i] = df.loc[r]
    data2[i] = df.loc[r2]

In [125]:

data1 = data1.T
data2 = data2.T

In [142]:

stats.shapiro(data1['물건금액(만원)'])

Out[142]:

ShapiroResult(statistic=0.9806092977523804, pvalue=0.007261964492499828)

In [143]:

sns.distplot(data1['물건금액(만원)'])

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[143]:

<AxesSubplot:xlabel='물건금액(만원)', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47932 (\N{HANGUL SYLLABLE MUL}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47564 (\N{HANGUL SYLLABLE MAN}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50896 (\N{HANGUL SYLLABLE WEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [128]:

stats.levene(data1['물건금액(만원)'], data2['물건금액(만원)'])

Out[128]:

LeveneResult(statistic=0.038709411342177195, pvalue=0.8441258974314072)

In [135]:

stats.ttest_ind(data1['물건금액(만원)'].apply(lambda x: float(x)), data2['물건금액(만원)'].apply(lambda x: float(x)))

Out[135]:

Ttest_indResult(statistic=-0.3327522991505876, pvalue=0.739496466909871)

In [ ]:

In [86]:

df

Out[86]:

접수연도자치구코드자치구명법정동코드법정동명지번구분지번구분명본번부번건물명...물건금액(만원)건물면적(㎡)토지면적(㎡)층권리구분취소일건축년도건물용도신고구분신고한 개업공인중개사 시군구명01234...639995639996639997639998639999

2023	11710	송파구	10600	삼전동	1.0	대지	43.0	3.0	더트라이앵글	...	27000	16.59	12.15	3.0	NaN	NaN	2021.0	연립다세대	중개거래	서울 송파구
2023	11140	중구	16100	저동2가	1.0	대지	7.0	2.0	제이매크로 타워	...	20300	19.35	31.04	9.0	NaN	NaN	2018.0	오피스텔	중개거래	서울 중구
2023	11260	중랑구	10300	중화동	1.0	대지	295.0	38.0	대일빌라	...	13500	44.00	27.58	2.0	NaN	NaN	1994.0	연립다세대	중개거래	서울 중랑구
2023	11740	강동구	10900	천호동	1.0	대지	425.0	3.0	천호역 한강 푸르지오시티	...	18800	25.24	35.91	28.0	NaN	NaN	2015.0	오피스텔	중개거래	서울 강동구
2023	11380	은평구	10800	역촌동	1.0	대지	80.0	2.0	임창에버빌(80-2)	...	22500	40.48	15.96	6.0	NaN	NaN	2011.0	연립다세대	중개거래	서울 은평구
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
2019	11320	도봉구	10500	쌍문동	1.0	대지	56.0	0.0	삼익세라믹	...	22000	42.93	NaN	8.0	NaN	NaN	1988.0	아파트	NaN	NaN
2019	11290	성북구	13400	길음동	1.0	대지	1278.0	0.0	길음동동부센트레빌(1278-0)	...	49800	60.00	NaN	8.0	NaN	NaN	2003.0	아파트	NaN	NaN
2019	11320	도봉구	10600	방학동	1.0	대지	271.0	1.0	신동아아파트1	...	25000	53.16	NaN	15.0	NaN	NaN	1986.0	아파트	NaN	NaN
2019	11305	강북구	10300	수유동	1.0	대지	169.0	5.0	드림하이빌(169-5)	...	19500	43.88	27.20	5.0	NaN	NaN	2015.0	연립다세대	NaN	NaN
2019	11380	은평구	10900	신사동	1.0	대지	261.0	20.0	성락타운(261-20)	...	20250	49.35	45.72	1.0	NaN	NaN	1987.0	연립다세대	NaN	NaN

640000 rows × 21 columns

In [77]:

math.log2(10)

Out[77]:

3.321928094887362

In [78]:

math.log2(1000000000000)

Out[78]:

39.86313713864835

In [79]:

math.log2(10000000000000)

Out[79]:

43.18506523353571

In [ ]:

p-value 표기법

p < 0.05 == *
p < 0.01 == **
p < 0.001 == ***
p < 0.0001 == **
p >= 0.05 == ns

In [188]:

# 정규성 검정
# 귀무가설: 정규성을 만족하는 모집단에서 나왔다
# p < 0.05 이므로 정규성을 만족하지 않는다 라고 볼 수 있음
stats.shapiro(df['수출건수'])

Out[188]:

ShapiroResult(statistic=0.6007808446884155, pvalue=6.888485586387105e-06)

In [189]:

import seaborn as sns

In [190]:

sns.distplot(df['수출건수'])

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[190]:

<AxesSubplot:xlabel='수출건수', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 52636 (\N{HANGUL SYLLABLE CUL}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [200]:

# 귀무가설: 두 그룹의 차이가 나지 않는다
# p >= 0.05 이기 때문에 두 그룹 간의 차이가 나지 않는다고 볼 수 있음
stats.wilcoxon(df['수출건수'],df['수입건수'])

Out[200]:

WilcoxonResult(statistic=48.0, pvalue=0.1083831787109375)

In [201]:

# KS test
# 귀무가설: 같은 분산을 가진 표본에서 추출한 것이다 (등분산성이라고 가정할 수 있음)
# p >= 0.05 이기 때문에 등분산성임을 확인
stats.ks_2samp(df['수출건수'],df['수입건수'])

Out[201]:

KstestResult(statistic=0.16666666666666666, pvalue=0.9715397823325015)

In [202]:

# Welch’s T-test
# Student's T-test에서 equal_var 옵션을 False 로 설정하면 됩니다
stats.ttest_ind(df['수출건수'],df['수입건수'], equal_var=False)

Out[202]:

Ttest_indResult(statistic=-1.3740039439389151, pvalue=0.18436372922684297)

실습

아래의 데이터에서 각각 T-test를 진행한다
T-test를 진행할 때, 결과가 특정 주장을 할 수 있어야 합니다
FTA 데이터 예시) A/B 그룹의 수출금액이 서로 다른지 비교할 경우, 그 원인을 찾아 주장할 수 있다

In [236]:

df1 = pd.read_excel('./FTA무역통계_20230109000036.xls', index_col=1, header=4)

C:\ProgramData\Anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default
  warn("Workbook contains no default style, apply openpyxl's default")

In [206]:

df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')

C:\Users\user\AppData\Local\Temp\ipykernel_18480\496141101.py:1: DtypeWarning: Columns (7,19,20) have mixed types. Specify dtype option on import or set low_memory=False.
  df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')

In [237]:

유럽 = ['한-EFTA','한-EU','한-영국']
아시아 = ['RCEP','한-베트남','한-싱가포르','한-아세안','한-인도','한-중국']
아메리카 = ['한-미국','한-중미','한-칠레','한-캐나다','한-콜롬비아','한-페루']
서아시아 = ['한-터키']
오세아니아 = ['한-호주','한-뉴질랜드']

In [238]:

df1['수입금액'] = [int(i.replace(' ','').replace(',','')) for i in df1['수입금액']]

In [239]:

df1.loc[유럽]

Out[239]:

기간수출건수수출금액수입건수수입금액무역수지FTA명한-EFTA한-EU한-영국

2022	26,912	1,794,418	75,489	4810598	-3,016,180
2022	882,224	67,684,624	4,734,145	66716800	967,825
2022	108,368	5,862,160	386,507	5374130	488,030

In [240]:

In [ ]:

# 수입금액을 아시아-아메리카 분류로 비교

In [241]:

stats.shapiro(df1['수입금액'])

Out[241]:

ShapiroResult(statistic=0.6233973503112793, pvalue=1.1650753549474757e-05)

In [242]:

import matplotlib.pyplot as plt
stats.probplot(df1['수입금액'], plot=plt)

Out[242]:

((array([-1.77709673, -1.33087857, -1.05345661, -0.83977496, -0.6589352 ,
         -0.49749478, -0.34812942, -0.20618578, -0.06829777,  0.06829777,
          0.20618578,  0.34812942,  0.49749478,  0.6589352 ,  0.83977496,
          1.05345661,  1.33087857,  1.77709673]),
  array([   636784,    933363,   1287251,   1743951,   2734129,   4810598,
           5374130,   6264483,   7966998,   8403685,   9315582,  24850108,
          41235660,  66716800,  74972585,  75744022, 142753918, 292113146],
        dtype=int64)),
 (60201229.04824289, 42658732.94444444, 0.7760322824492758))

In [243]:

sns.distplot(df1['수입금액'])

C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)

Out[243]:

<AxesSubplot:xlabel='수입금액', ylabel='Density'>

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 51077 (\N{HANGUL SYLLABLE IB}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font.
  fig.canvas.print_figure(bytes_io, **kw)

In [248]:

stats.ks_2samp(df1.loc[아시아,'수입금액'],df1.loc[아메리카,'수입금액'])

Out[248]:

KstestResult(statistic=0.8333333333333334, pvalue=0.025974025974025972)

In [249]:

stats.wilcoxon(df1.loc[아시아,'수입금액'],df1.loc[아메리카,'수입금액'])

Out[249]:

WilcoxonResult(statistic=0.0, pvalue=0.03125)

In [250]:

stats.mannwhitneyu(df1.loc[아시아,'수입금액'],df1.loc[아메리카,'수입금액'])

Out[250]:

MannwhitneyuResult(statistic=33.0, pvalue=0.015151515151515152)

In [252]:

df1.loc[아시아,'수입금액'].mean() > df1.loc[아메리카,'수입금액'].mean()

Out[252]:

True

In [270]:

df2[['건물면적(㎡)','토지면적(㎡)']]

Out[270]:

건물면적(㎡)토지면적(㎡)01234...639995639996639997639998639999

16.59	12.150000
19.35	31.040000
44.00	27.580000
25.24	35.910000
40.48	15.960000
...	...
42.93	35.520121
60.00	35.520121
53.16	35.520121
43.88	27.200000
49.35	45.720000

640000 rows × 2 columns

In [271]:

stats.shapiro(df2['건물면적(㎡)'])

C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_morestats.py:1800: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Out[271]:

ShapiroResult(statistic=0.4947553873062134, pvalue=0.0)

In [272]:

stats.shapiro(df2['토지면적(㎡)'])

C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_morestats.py:1800: UserWarning: p-value may not be accurate for N > 5000.
  warnings.warn("p-value may not be accurate for N > 5000.")

Out[272]:

ShapiroResult(statistic=0.3813157081604004, pvalue=0.0)

In [262]:

df2['토지면적(㎡)'].fillna(df2['토지면적(㎡)'].mean(), inplace=True)

In [264]:

stats.ks_2samp(df2['건물면적(㎡)'], df2['토지면적(㎡)'])

Out[264]:

KstestResult(statistic=0.49686874999999997, pvalue=0.0)

In [275]:

stats.ttest_ind(df2['건물면적(㎡)'], df2['토지면적(㎡)'], equal_var=False)

Out[275]:

Ttest_indResult(statistic=272.40307457631644, pvalue=0.0)

In [269]:

df2['건물면적(㎡)'].mean() > df2['토지면적(㎡)'].mean()

Out[269]:

True

In [ ]:

'Python > Jupyter notebook' 카테고리의 다른 글

#10 (0)	2023.01.16
#9 (0)	2023.01.16
#8-2 (0)	2023.01.16
#8-1 (0)	2023.01.16
#7 (0)	2023.01.16

ABOUT ME

so_nrisa so_nrisa

데이터 이산화

데이터 축소

소수점 아래 자리수 알아보기

소수점 아래 자리수가 있는지 없는지 체크하는 코드 만들어보기

고정소수점, 부동소수점 연산 시연

차원축소

0113 추가 실습

T-test 절차

실습

p-value 표기법

실습

'Python > Jupyter notebook' 카테고리의 다른 글

티스토리툴바

1	1	1	1	0
0	0	0	0	0
0	1	1	1	0
0	0	0	0	0
1	1	1	1	1
0	1	0	0	1
0	0	0	0	1
1	1	0	1	1
0	0	0	0	0
0	0	0	0	1
1	1	1	1	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	1
0	0	0	0	0
0	0	0	0	0

1	1	1	1	0
0	0	0	0	0
0	1	1	1	0
0	0	0	0	0
1	1	1	1	1
0	1	0	0	1
0	0	0	0	1
1	1	0	1	1
0	0	0	0	0
0	0	0	0	1
1	1	1	1	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	1
0	0	0	0	0
0	0	0	0	0

ABOUT ME

데이터 이산화

데이터 축소

소수점 아래 자리수 알아보기

소수점 아래 자리수가 있는지 없는지 체크하는 코드 만들어보기

고정소수점, 부동소수점 연산 시연

차원축소

0113 추가 실습

T-test 절차

실습

p-value 표기법

실습

'Python > Jupyter notebook' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바

1	1	1	1	0
0	0	0	0	0
0	1	1	1	0
0	0	0	0	0
1	1	1	1	1
0	1	0	0	1
0	0	0	0	1
1	1	0	1	1
0	0	0	0	0
0	0	0	0	1
1	1	1	1	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	0
0	0	0	0	1
0	0	0	0	0
0	0	0	0	0