-
#8-3Python/Jupyter notebook 2023. 1. 16. 21:36
데이터 이산화
In [1]:import pandas as pd
In [7]:fta = pd.read_excel('./FTA무역통계_20230109000036.xls', header=4, index_col=1)
C:\ProgramData\Anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
- FTA 데이터에서 수출건수, 수출금액, 수입건수, 수입금액, 무역수지가
- 각각 평균 이상인 FTA와 평균 미만인 FTA로 이산화를 해 보자
- 이산화 데이터는 fta_bin 으로 저장한다
- 시간이 남는다면, 어떤 데이터를 가져와도 동작하는 표준 코드를 만들어보자
In [41]:fta['수출건수'].apply(lambda x: int(x.replace(' ','').replace(',',''))).mean() # 띄어쓰기, 콤마 없애기
Out[41]:884149.9444444445
In [9]:fta.columns
Out[9]:Index(['기간', '수출건수', '수출금액', '수입건수', '수입금액', '무역수지'], dtype='object')
In [11]:target = fta.columns[1:]
fta_bin = pd.DataFrame()
df = fta
for c in target:
    col = df[c].apply(lambda x: int(x.replace(' ','').replace(',','')))
    fta_bin[c] = col > col.mean()
In [19]:fta_bin # True, False 이진분류
Out[19]:수출건수수출금액수입건수수입금액무역수지FTA명RCEP한-EFTA한-EU한-뉴질랜드한-미국한-베트남한-싱가포르한-아세안한-영국한-인도한-중국한-중미한-칠레한-캐나다한-콜롬비아한-터키한-페루한-호주True True True True False False False False False False False True True True False False False False False False True True True True True False True False False True False False False False True True True False True True False False False False False False False False False True True True True True False False False False False False False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False
In [20]:fta_bin.replace([True,False], [1,0]) # DataFrame에 replace 함수 사용해보기. 여러 값 입력할 때는 리스트
Out[20]:수출건수수출금액수입건수수입금액무역수지FTA명RCEP한-EFTA한-EU한-뉴질랜드한-미국한-베트남한-싱가포르한-아세안한-영국한-인도한-중국한-중미한-칠레한-캐나다한-콜롬비아한-터키한-페루한-호주1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
데이터 축소
- 앞서 배운 내용을 기반으로 데이터 축소를 해 보자
- 단, 차원축소 제외
In [93]:df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')
C:\Users\user\AppData\Local\Temp\ipykernel_18480\496141101.py:1: DtypeWarning: Columns (7,19,20) have mixed types. Specify dtype option on import or set low_memory=False. df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')
In [97]:df2[['자치구코드','자치구명']]
Out[97]:자치구코드자치구명01234...63999563999663999763999863999911710 송파구 11140 중구 11260 중랑구 11740 강동구 11380 은평구 ... ... 11320 도봉구 11290 성북구 11320 도봉구 11305 강북구 11380 은평구 640000 rows × 2 columns
In [67]:df2['권리구분'].unique()
Out[67]:array([nan, '입주권', '분양권'], dtype=object)
In [69]:df2['신고구분'].unique()
Out[69]:array(['중개거래', '직거래', nan], dtype=object)
In [72]:df2[['자치구코드','자치구명']].drop_duplicates() # 해당 데이터를 메타데이터로 저장하면 둘 중 하나를 제거할 수 있음
Out[72]:자치구코드자치구명012345781012141517202122252731333448607011411710 송파구 11140 중구 11260 중랑구 11740 강동구 11380 은평구 11230 동대문구 11545 금천구 11200 성동구 11470 양천구 11410 서대문구 11320 도봉구 11305 강북구 11590 동작구 11620 관악구 11560 영등포구 11530 구로구 11650 서초구 11500 강서구 11215 광진구 11350 노원구 11680 강남구 11290 성북구 11170 용산구 11110 종로구 11440 마포구
In [73]:df2[['법정동코드','법정동명']].drop_duplicates()
Out[73]:법정동코드법정동명01234...56729458166259997360100861755910600 삼전동 16100 저동2가 10300 중화동 10900 천호동 10800 역촌동 ... ... 12600 명동1가 16600 서소문동 15900 초동 13000 남산동3가 11000 수표동 418 rows × 2 columns
In [74]:df2[['지번구분','지번구분명']].drop_duplicates()
Out[74]:지번구분지번구분명028127346391.0 대지 NaN NaN 3.0 블럭 2.0 산
In [79]:df2.drop(['접수연도','자치구명','법정동명','지번구분명'], inplace=True, axis='columns')
In [80]:#df2.to_feather('./result.feather')  # feather 등 I/O 부담이 덜한 파일 형식을 이용할 수 있음
Out[80]:자치구코드법정동코드지번구분본번부번건물명계약일물건금액(만원)건물면적(㎡)토지면적(㎡)층권리구분취소일건축년도건물용도신고구분신고한 개업공인중개사 시군구명01234...63999563999663999763999863999911710 10600 1.0 43.0 3.0 더트라이앵글 20230106 27000 16.59 12.15 3.0 NaN NaN 2021.0 연립다세대 중개거래 서울 송파구 11140 16100 1.0 7.0 2.0 제이매크로 타워 20230106 20300 19.35 31.04 9.0 NaN NaN 2018.0 오피스텔 중개거래 서울 중구 11260 10300 1.0 295.0 38.0 대일빌라 20230106 13500 44.00 27.58 2.0 NaN NaN 1994.0 연립다세대 중개거래 서울 중랑구 11740 10900 1.0 425.0 3.0 천호역 한강 푸르지오시티 20230106 18800 25.24 35.91 28.0 NaN NaN 2015.0 오피스텔 중개거래 서울 강동구 11380 10800 1.0 80.0 2.0 임창에버빌(80-2) 20230106 22500 40.48 15.96 6.0 NaN NaN 2011.0 연립다세대 중개거래 서울 은평구 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 11320 10500 1.0 56.0 0.0 삼익세라믹 20190604 22000 42.93 NaN 8.0 NaN NaN 1988.0 아파트 NaN NaN 11290 13400 1.0 1278.0 0.0 길음동동부센트레빌(1278-0) 20190604 49800 60.00 NaN 8.0 NaN NaN 2003.0 아파트 NaN NaN 11320 10600 1.0 271.0 1.0 신동아아파트1 20190604 25000 53.16 NaN 15.0 NaN NaN 1986.0 아파트 NaN NaN 11305 10300 1.0 169.0 5.0 드림하이빌(169-5) 20190604 19500 43.88 27.20 5.0 NaN NaN 2015.0 연립다세대 NaN NaN 11380 10900 1.0 261.0 20.0 성락타운(261-20) 20190604 20250 49.35 45.72 1.0 NaN NaN 1987.0 연립다세대 NaN NaN 640000 rows × 17 columns
소수점 아래 자리수 알아보기
- 45.12와 45는 각각 실수와 정수라는 차이가 있다
- 45.12와 45는 .12라는 값의 차이가 있다
- 45.12와 45는 유효숫자의 자릿수 차이가 있다
소수점 아래 자리수가 있는지 없는지 체크하는 코드 만들어보기
In [98]:round(45.12) # 반올림하는 함수
Out[98]:45
In [99]:int(45.12)
Out[99]:45
In [100]:type(45.12), type(45)
Out[100]:(float, int)
In [102]:str(45.12), str(45)
Out[102]:('45.12', '45')
In [83]:df2['물건금액(만원)']/100 # 유효숫자 변동이 없다면, 단위를 조절하는 의미가 크지 않다
Out[83]:0 270.0 1 203.0 2 135.0 3 188.0 4 225.0 ... 639995 220.0 639996 498.0 639997 250.0 639998 195.0 639999 202.5 Name: 물건금액(만원), Length: 640000, dtype: float64
In [91]:# 단위 변환 시 소수점이 생기지 않는 게 바람직함. 단, 원본 데이터에 소수점이 있는 경우는 예외
price = df2['물건금액(만원)']/100
price_int = price.apply(lambda x: int(x))
(price == price_int).unique()
Out[91]:array([ True, False])
고정소수점, 부동소수점 연산 시연
In [104]:float(45.6565) # 부동소수점
Out[104]:45.6565
In [108]:float1 = 45.6565
In [110]:'%.70f' %float1
Out[110]:'45.6565000000000011937117960769683122634887695312500000000000000000000000'
In [105]:import decimal # 고정소수점
In [127]:decimal.Decimal(45) / decimal.Decimal(6)
Out[127]:Decimal('7.5')
In [121]:float3 = float(7.55)
In [126]:'%.70f' %float3
Out[126]:'7.5499999999999998223643160599749535322189331054687500000000000000000000'
차원축소
In [ ]:# sklearn이 안 깔렸을 때는
# 명령 프롬프트 창에 pip install scikit-learn (PyPI 패키지명은 sklearn이 아니라 scikit-learn)
In [21]:from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
In [24]:from sklearn.datasets import load_iris # 붓꽃 데이터
In [25]:iris = load_iris()
In [28]:iris
Out[28]:{'data': array([[5.1, 3.5, 1.4, 0.2], [4.9, 3. , 1.4, 0.2], [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2], [5. , 3.6, 1.4, 0.2], [5.4, 3.9, 1.7, 0.4], [4.6, 3.4, 1.4, 0.3], [5. , 3.4, 1.5, 0.2], [4.4, 2.9, 1.4, 0.2], [4.9, 3.1, 1.5, 0.1], [5.4, 3.7, 1.5, 0.2], [4.8, 3.4, 1.6, 0.2], [4.8, 3. , 1.4, 0.1], [4.3, 3. , 1.1, 0.1], [5.8, 4. , 1.2, 0.2], [5.7, 4.4, 1.5, 0.4], [5.4, 3.9, 1.3, 0.4], [5.1, 3.5, 1.4, 0.3], [5.7, 3.8, 1.7, 0.3], [5.1, 3.8, 1.5, 0.3], [5.4, 3.4, 1.7, 0.2], [5.1, 3.7, 1.5, 0.4], [4.6, 3.6, 1. , 0.2], [5.1, 3.3, 1.7, 0.5], [4.8, 3.4, 1.9, 0.2], [5. , 3. , 1.6, 0.2], [5. , 3.4, 1.6, 0.4], [5.2, 3.5, 1.5, 0.2], [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2], [4.8, 3.1, 1.6, 0.2], [5.4, 3.4, 1.5, 0.4], [5.2, 4.1, 1.5, 0.1], [5.5, 4.2, 1.4, 0.2], [4.9, 3.1, 1.5, 0.2], [5. , 3.2, 1.2, 0.2], [5.5, 3.5, 1.3, 0.2], [4.9, 3.6, 1.4, 0.1], [4.4, 3. , 1.3, 0.2], [5.1, 3.4, 1.5, 0.2], [5. , 3.5, 1.3, 0.3], [4.5, 2.3, 1.3, 0.3], [4.4, 3.2, 1.3, 0.2], [5. , 3.5, 1.6, 0.6], [5.1, 3.8, 1.9, 0.4], [4.8, 3. , 1.4, 0.3], [5.1, 3.8, 1.6, 0.2], [4.6, 3.2, 1.4, 0.2], [5.3, 3.7, 1.5, 0.2], [5. , 3.3, 1.4, 0.2], [7. , 3.2, 4.7, 1.4], [6.4, 3.2, 4.5, 1.5], [6.9, 3.1, 4.9, 1.5], [5.5, 2.3, 4. , 1.3], [6.5, 2.8, 4.6, 1.5], [5.7, 2.8, 4.5, 1.3], [6.3, 3.3, 4.7, 1.6], [4.9, 2.4, 3.3, 1. ], [6.6, 2.9, 4.6, 1.3], [5.2, 2.7, 3.9, 1.4], [5. , 2. , 3.5, 1. ], [5.9, 3. , 4.2, 1.5], [6. , 2.2, 4. , 1. ], [6.1, 2.9, 4.7, 1.4], [5.6, 2.9, 3.6, 1.3], [6.7, 3.1, 4.4, 1.4], [5.6, 3. , 4.5, 1.5], [5.8, 2.7, 4.1, 1. ], [6.2, 2.2, 4.5, 1.5], [5.6, 2.5, 3.9, 1.1], [5.9, 3.2, 4.8, 1.8], [6.1, 2.8, 4. , 1.3], [6.3, 2.5, 4.9, 1.5], [6.1, 2.8, 4.7, 1.2], [6.4, 2.9, 4.3, 1.3], [6.6, 3. , 4.4, 1.4], [6.8, 2.8, 4.8, 1.4], [6.7, 3. , 5. , 1.7], [6. , 2.9, 4.5, 1.5], [5.7, 2.6, 3.5, 1. ], [5.5, 2.4, 3.8, 1.1], [5.5, 2.4, 3.7, 1. ], [5.8, 2.7, 3.9, 1.2], [6. , 2.7, 5.1, 1.6], [5.4, 3. , 4.5, 1.5], [6. , 3.4, 4.5, 1.6], [6.7, 3.1, 4.7, 1.5], [6.3, 2.3, 4.4, 1.3], [5.6, 3. , 4.1, 1.3], [5.5, 2.5, 4. 
, 1.3], [5.5, 2.6, 4.4, 1.2], [6.1, 3. , 4.6, 1.4], [5.8, 2.6, 4. , 1.2], [5. , 2.3, 3.3, 1. ], [5.6, 2.7, 4.2, 1.3], [5.7, 3. , 4.2, 1.2], [5.7, 2.9, 4.2, 1.3], [6.2, 2.9, 4.3, 1.3], [5.1, 2.5, 3. , 1.1], [5.7, 2.8, 4.1, 1.3], [6.3, 3.3, 6. , 2.5], [5.8, 2.7, 5.1, 1.9], [7.1, 3. , 5.9, 2.1], [6.3, 2.9, 5.6, 1.8], [6.5, 3. , 5.8, 2.2], [7.6, 3. , 6.6, 2.1], [4.9, 2.5, 4.5, 1.7], [7.3, 2.9, 6.3, 1.8], [6.7, 2.5, 5.8, 1.8], [7.2, 3.6, 6.1, 2.5], [6.5, 3.2, 5.1, 2. ], [6.4, 2.7, 5.3, 1.9], [6.8, 3. , 5.5, 2.1], [5.7, 2.5, 5. , 2. ], [5.8, 2.8, 5.1, 2.4], [6.4, 3.2, 5.3, 2.3], [6.5, 3. , 5.5, 1.8], [7.7, 3.8, 6.7, 2.2], [7.7, 2.6, 6.9, 2.3], [6. , 2.2, 5. , 1.5], [6.9, 3.2, 5.7, 2.3], [5.6, 2.8, 4.9, 2. ], [7.7, 2.8, 6.7, 2. ], [6.3, 2.7, 4.9, 1.8], [6.7, 3.3, 5.7, 2.1], [7.2, 3.2, 6. , 1.8], [6.2, 2.8, 4.8, 1.8], [6.1, 3. , 4.9, 1.8], [6.4, 2.8, 5.6, 2.1], [7.2, 3. , 5.8, 1.6], [7.4, 2.8, 6.1, 1.9], [7.9, 3.8, 6.4, 2. ], [6.4, 2.8, 5.6, 2.2], [6.3, 2.8, 5.1, 1.5], [6.1, 2.6, 5.6, 1.4], [7.7, 3. , 6.1, 2.3], [6.3, 3.4, 5.6, 2.4], [6.4, 3.1, 5.5, 1.8], [6. , 3. , 4.8, 1.8], [6.9, 3.1, 5.4, 2.1], [6.7, 3.1, 5.6, 2.4], [6.9, 3.1, 5.1, 2.3], [5.8, 2.7, 5.1, 1.9], [6.8, 3.2, 5.9, 2.3], [6.7, 3.3, 5.7, 2.5], [6.7, 3. , 5.2, 2.3], [6.3, 2.5, 5. , 1.9], [6.5, 3. , 5.2, 2. ], [6.2, 3.4, 5.4, 2.3], [5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'frame': None, 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. 
_iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n \n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. 
(1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], 'filename': 'iris.csv', 'data_module': 'sklearn.datasets.data'}
In [30]:iris.data
Out[30]:array([[5.1, 3.5, 1.4, 0.2], [4.9, 3. , 1.4, 0.2], [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2], [5. , 3.6, 1.4, 0.2], [5.4, 3.9, 1.7, 0.4], [4.6, 3.4, 1.4, 0.3], [5. , 3.4, 1.5, 0.2], [4.4, 2.9, 1.4, 0.2], [4.9, 3.1, 1.5, 0.1], [5.4, 3.7, 1.5, 0.2], [4.8, 3.4, 1.6, 0.2], [4.8, 3. , 1.4, 0.1], [4.3, 3. , 1.1, 0.1], [5.8, 4. , 1.2, 0.2], [5.7, 4.4, 1.5, 0.4], [5.4, 3.9, 1.3, 0.4], [5.1, 3.5, 1.4, 0.3], [5.7, 3.8, 1.7, 0.3], [5.1, 3.8, 1.5, 0.3], [5.4, 3.4, 1.7, 0.2], [5.1, 3.7, 1.5, 0.4], [4.6, 3.6, 1. , 0.2], [5.1, 3.3, 1.7, 0.5], [4.8, 3.4, 1.9, 0.2], [5. , 3. , 1.6, 0.2], [5. , 3.4, 1.6, 0.4], [5.2, 3.5, 1.5, 0.2], [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2], [4.8, 3.1, 1.6, 0.2], [5.4, 3.4, 1.5, 0.4], [5.2, 4.1, 1.5, 0.1], [5.5, 4.2, 1.4, 0.2], [4.9, 3.1, 1.5, 0.2], [5. , 3.2, 1.2, 0.2], [5.5, 3.5, 1.3, 0.2], [4.9, 3.6, 1.4, 0.1], [4.4, 3. , 1.3, 0.2], [5.1, 3.4, 1.5, 0.2], [5. , 3.5, 1.3, 0.3], [4.5, 2.3, 1.3, 0.3], [4.4, 3.2, 1.3, 0.2], [5. , 3.5, 1.6, 0.6], [5.1, 3.8, 1.9, 0.4], [4.8, 3. , 1.4, 0.3], [5.1, 3.8, 1.6, 0.2], [4.6, 3.2, 1.4, 0.2], [5.3, 3.7, 1.5, 0.2], [5. , 3.3, 1.4, 0.2], [7. , 3.2, 4.7, 1.4], [6.4, 3.2, 4.5, 1.5], [6.9, 3.1, 4.9, 1.5], [5.5, 2.3, 4. , 1.3], [6.5, 2.8, 4.6, 1.5], [5.7, 2.8, 4.5, 1.3], [6.3, 3.3, 4.7, 1.6], [4.9, 2.4, 3.3, 1. ], [6.6, 2.9, 4.6, 1.3], [5.2, 2.7, 3.9, 1.4], [5. , 2. , 3.5, 1. ], [5.9, 3. , 4.2, 1.5], [6. , 2.2, 4. , 1. ], [6.1, 2.9, 4.7, 1.4], [5.6, 2.9, 3.6, 1.3], [6.7, 3.1, 4.4, 1.4], [5.6, 3. , 4.5, 1.5], [5.8, 2.7, 4.1, 1. ], [6.2, 2.2, 4.5, 1.5], [5.6, 2.5, 3.9, 1.1], [5.9, 3.2, 4.8, 1.8], [6.1, 2.8, 4. , 1.3], [6.3, 2.5, 4.9, 1.5], [6.1, 2.8, 4.7, 1.2], [6.4, 2.9, 4.3, 1.3], [6.6, 3. , 4.4, 1.4], [6.8, 2.8, 4.8, 1.4], [6.7, 3. , 5. , 1.7], [6. , 2.9, 4.5, 1.5], [5.7, 2.6, 3.5, 1. ], [5.5, 2.4, 3.8, 1.1], [5.5, 2.4, 3.7, 1. ], [5.8, 2.7, 3.9, 1.2], [6. , 2.7, 5.1, 1.6], [5.4, 3. , 4.5, 1.5], [6. , 3.4, 4.5, 1.6], [6.7, 3.1, 4.7, 1.5], [6.3, 2.3, 4.4, 1.3], [5.6, 3. , 4.1, 1.3], [5.5, 2.5, 4. 
, 1.3], [5.5, 2.6, 4.4, 1.2], [6.1, 3. , 4.6, 1.4], [5.8, 2.6, 4. , 1.2], [5. , 2.3, 3.3, 1. ], [5.6, 2.7, 4.2, 1.3], [5.7, 3. , 4.2, 1.2], [5.7, 2.9, 4.2, 1.3], [6.2, 2.9, 4.3, 1.3], [5.1, 2.5, 3. , 1.1], [5.7, 2.8, 4.1, 1.3], [6.3, 3.3, 6. , 2.5], [5.8, 2.7, 5.1, 1.9], [7.1, 3. , 5.9, 2.1], [6.3, 2.9, 5.6, 1.8], [6.5, 3. , 5.8, 2.2], [7.6, 3. , 6.6, 2.1], [4.9, 2.5, 4.5, 1.7], [7.3, 2.9, 6.3, 1.8], [6.7, 2.5, 5.8, 1.8], [7.2, 3.6, 6.1, 2.5], [6.5, 3.2, 5.1, 2. ], [6.4, 2.7, 5.3, 1.9], [6.8, 3. , 5.5, 2.1], [5.7, 2.5, 5. , 2. ], [5.8, 2.8, 5.1, 2.4], [6.4, 3.2, 5.3, 2.3], [6.5, 3. , 5.5, 1.8], [7.7, 3.8, 6.7, 2.2], [7.7, 2.6, 6.9, 2.3], [6. , 2.2, 5. , 1.5], [6.9, 3.2, 5.7, 2.3], [5.6, 2.8, 4.9, 2. ], [7.7, 2.8, 6.7, 2. ], [6.3, 2.7, 4.9, 1.8], [6.7, 3.3, 5.7, 2.1], [7.2, 3.2, 6. , 1.8], [6.2, 2.8, 4.8, 1.8], [6.1, 3. , 4.9, 1.8], [6.4, 2.8, 5.6, 2.1], [7.2, 3. , 5.8, 1.6], [7.4, 2.8, 6.1, 1.9], [7.9, 3.8, 6.4, 2. ], [6.4, 2.8, 5.6, 2.2], [6.3, 2.8, 5.1, 1.5], [6.1, 2.6, 5.6, 1.4], [7.7, 3. , 6.1, 2.3], [6.3, 3.4, 5.6, 2.4], [6.4, 3.1, 5.5, 1.8], [6. , 3. , 4.8, 1.8], [6.9, 3.1, 5.4, 2.1], [6.7, 3.1, 5.6, 2.4], [6.9, 3.1, 5.1, 2.3], [5.8, 2.7, 5.1, 1.9], [6.8, 3.2, 5.9, 2.3], [6.7, 3.3, 5.7, 2.5], [6.7, 3. , 5.2, 2.3], [6.3, 2.5, 5. , 1.9], [6.5, 3. , 5.2, 2. ], [6.2, 3.4, 5.4, 2.3], [5.9, 3. , 5.1, 1.8]])
In [32]:print(iris.DESCR) # 메타데이터
.. _iris_dataset: Iris plants dataset -------------------- **Data Set Characteristics:** :Number of Instances: 150 (50 in each of three classes) :Number of Attributes: 4 numeric, predictive attributes and the class :Attribute Information: - sepal length in cm - sepal width in cm - petal length in cm - petal width in cm - class: - Iris-Setosa - Iris-Versicolour - Iris-Virginica :Summary Statistics: ============== ==== ==== ======= ===== ==================== Min Max Mean SD Class Correlation ============== ==== ==== ======= ===== ==================== sepal length: 4.3 7.9 5.84 0.83 0.7826 sepal width: 2.0 4.4 3.05 0.43 -0.4194 petal length: 1.0 6.9 3.76 1.76 0.9490 (high!) petal width: 0.1 2.5 1.20 0.76 0.9565 (high!) ============== ==== ==== ======= ===== ==================== :Missing Attribute Values: None :Class Distribution: 33.3% for each of 3 classes. :Creator: R.A. Fisher :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov) :Date: July, 1988 The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken from Fisher's paper. Note that it's the same as in R, but not as in the UCI Machine Learning Repository, which has two wrong data points. This is perhaps the best known database to be found in the pattern recognition literature. Fisher's paper is a classic in the field and is referenced frequently to this day. (See Duda & Hart, for example.) The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant. One class is linearly separable from the other 2; the latter are NOT linearly separable from each other. .. topic:: References - Fisher, R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950). - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis. (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218. - Dasarathy, B.V. 
(1980) "Nosing Around the Neighborhood: A New System Structure and Classification Rule for Recognition in Partially Exposed Environments". IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. PAMI-2, No. 1, 67-71. - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions on Information Theory, May 1972, 431-433. - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II conceptual clustering system finds 3 classes in the data. - Many, many more ...
In [33]:iris.feature_names
Out[33]:['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
In [63]:iris.target
Out[63]:array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
In [58]:scaler = StandardScaler()
pca = PCA(n_components=3)
lda = LinearDiscriminantAnalysis(n_components=2)
In [59]:scaler.fit(iris.data)
pca.fit(iris.data)
Out[59]:PCA(n_components=3)
In [60]:scaled = scaler.transform(iris.data)
# 주의: PCA는 표준화하지 않은 원본(iris.data)에 적용됨 — scaled는 LDA 셀에서만 사용됨
reduced = pca.transform(iris.data)
In [64]:lda.fit(scaled, iris.target)
iris_lda = lda.transform(scaled)
In [129]:reduced.shape
Out[129]:(150, 3)
In [131]:iris_lda.shape
Out[131]:(150, 2)
In [132]:iris
Out[132]:{'data': array([[5.1, 3.5, 1.4, 0.2], [4.9, 3. , 1.4, 0.2], [4.7, 3.2, 1.3, 0.2], [4.6, 3.1, 1.5, 0.2], [5. , 3.6, 1.4, 0.2], [5.4, 3.9, 1.7, 0.4], [4.6, 3.4, 1.4, 0.3], [5. , 3.4, 1.5, 0.2], [4.4, 2.9, 1.4, 0.2], [4.9, 3.1, 1.5, 0.1], [5.4, 3.7, 1.5, 0.2], [4.8, 3.4, 1.6, 0.2], [4.8, 3. , 1.4, 0.1], [4.3, 3. , 1.1, 0.1], [5.8, 4. , 1.2, 0.2], [5.7, 4.4, 1.5, 0.4], [5.4, 3.9, 1.3, 0.4], [5.1, 3.5, 1.4, 0.3], [5.7, 3.8, 1.7, 0.3], [5.1, 3.8, 1.5, 0.3], [5.4, 3.4, 1.7, 0.2], [5.1, 3.7, 1.5, 0.4], [4.6, 3.6, 1. , 0.2], [5.1, 3.3, 1.7, 0.5], [4.8, 3.4, 1.9, 0.2], [5. , 3. , 1.6, 0.2], [5. , 3.4, 1.6, 0.4], [5.2, 3.5, 1.5, 0.2], [5.2, 3.4, 1.4, 0.2], [4.7, 3.2, 1.6, 0.2], [4.8, 3.1, 1.6, 0.2], [5.4, 3.4, 1.5, 0.4], [5.2, 4.1, 1.5, 0.1], [5.5, 4.2, 1.4, 0.2], [4.9, 3.1, 1.5, 0.2], [5. , 3.2, 1.2, 0.2], [5.5, 3.5, 1.3, 0.2], [4.9, 3.6, 1.4, 0.1], [4.4, 3. , 1.3, 0.2], [5.1, 3.4, 1.5, 0.2], [5. , 3.5, 1.3, 0.3], [4.5, 2.3, 1.3, 0.3], [4.4, 3.2, 1.3, 0.2], [5. , 3.5, 1.6, 0.6], [5.1, 3.8, 1.9, 0.4], [4.8, 3. , 1.4, 0.3], [5.1, 3.8, 1.6, 0.2], [4.6, 3.2, 1.4, 0.2], [5.3, 3.7, 1.5, 0.2], [5. , 3.3, 1.4, 0.2], [7. , 3.2, 4.7, 1.4], [6.4, 3.2, 4.5, 1.5], [6.9, 3.1, 4.9, 1.5], [5.5, 2.3, 4. , 1.3], [6.5, 2.8, 4.6, 1.5], [5.7, 2.8, 4.5, 1.3], [6.3, 3.3, 4.7, 1.6], [4.9, 2.4, 3.3, 1. ], [6.6, 2.9, 4.6, 1.3], [5.2, 2.7, 3.9, 1.4], [5. , 2. , 3.5, 1. ], [5.9, 3. , 4.2, 1.5], [6. , 2.2, 4. , 1. ], [6.1, 2.9, 4.7, 1.4], [5.6, 2.9, 3.6, 1.3], [6.7, 3.1, 4.4, 1.4], [5.6, 3. , 4.5, 1.5], [5.8, 2.7, 4.1, 1. ], [6.2, 2.2, 4.5, 1.5], [5.6, 2.5, 3.9, 1.1], [5.9, 3.2, 4.8, 1.8], [6.1, 2.8, 4. , 1.3], [6.3, 2.5, 4.9, 1.5], [6.1, 2.8, 4.7, 1.2], [6.4, 2.9, 4.3, 1.3], [6.6, 3. , 4.4, 1.4], [6.8, 2.8, 4.8, 1.4], [6.7, 3. , 5. , 1.7], [6. , 2.9, 4.5, 1.5], [5.7, 2.6, 3.5, 1. ], [5.5, 2.4, 3.8, 1.1], [5.5, 2.4, 3.7, 1. ], [5.8, 2.7, 3.9, 1.2], [6. , 2.7, 5.1, 1.6], [5.4, 3. , 4.5, 1.5], [6. , 3.4, 4.5, 1.6], [6.7, 3.1, 4.7, 1.5], [6.3, 2.3, 4.4, 1.3], [5.6, 3. , 4.1, 1.3], [5.5, 2.5, 4. 
, 1.3], [5.5, 2.6, 4.4, 1.2], [6.1, 3. , 4.6, 1.4], [5.8, 2.6, 4. , 1.2], [5. , 2.3, 3.3, 1. ], [5.6, 2.7, 4.2, 1.3], [5.7, 3. , 4.2, 1.2], [5.7, 2.9, 4.2, 1.3], [6.2, 2.9, 4.3, 1.3], [5.1, 2.5, 3. , 1.1], [5.7, 2.8, 4.1, 1.3], [6.3, 3.3, 6. , 2.5], [5.8, 2.7, 5.1, 1.9], [7.1, 3. , 5.9, 2.1], [6.3, 2.9, 5.6, 1.8], [6.5, 3. , 5.8, 2.2], [7.6, 3. , 6.6, 2.1], [4.9, 2.5, 4.5, 1.7], [7.3, 2.9, 6.3, 1.8], [6.7, 2.5, 5.8, 1.8], [7.2, 3.6, 6.1, 2.5], [6.5, 3.2, 5.1, 2. ], [6.4, 2.7, 5.3, 1.9], [6.8, 3. , 5.5, 2.1], [5.7, 2.5, 5. , 2. ], [5.8, 2.8, 5.1, 2.4], [6.4, 3.2, 5.3, 2.3], [6.5, 3. , 5.5, 1.8], [7.7, 3.8, 6.7, 2.2], [7.7, 2.6, 6.9, 2.3], [6. , 2.2, 5. , 1.5], [6.9, 3.2, 5.7, 2.3], [5.6, 2.8, 4.9, 2. ], [7.7, 2.8, 6.7, 2. ], [6.3, 2.7, 4.9, 1.8], [6.7, 3.3, 5.7, 2.1], [7.2, 3.2, 6. , 1.8], [6.2, 2.8, 4.8, 1.8], [6.1, 3. , 4.9, 1.8], [6.4, 2.8, 5.6, 2.1], [7.2, 3. , 5.8, 1.6], [7.4, 2.8, 6.1, 1.9], [7.9, 3.8, 6.4, 2. ], [6.4, 2.8, 5.6, 2.2], [6.3, 2.8, 5.1, 1.5], [6.1, 2.6, 5.6, 1.4], [7.7, 3. , 6.1, 2.3], [6.3, 3.4, 5.6, 2.4], [6.4, 3.1, 5.5, 1.8], [6. , 3. , 4.8, 1.8], [6.9, 3.1, 5.4, 2.1], [6.7, 3.1, 5.6, 2.4], [6.9, 3.1, 5.1, 2.3], [5.8, 2.7, 5.1, 1.9], [6.8, 3.2, 5.9, 2.3], [6.7, 3.3, 5.7, 2.5], [6.7, 3. , 5.2, 2.3], [6.3, 2.5, 5. , 1.9], [6.5, 3. , 5.2, 2. ], [6.2, 3.4, 5.4, 2.3], [5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'frame': None, 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. 
_iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 150 (50 in each of three classes)\n :Number of Attributes: 4 numeric, predictive attributes and the class\n :Attribute Information:\n - sepal length in cm\n - sepal width in cm\n - petal length in cm\n - petal width in cm\n - class:\n - Iris-Setosa\n - Iris-Versicolour\n - Iris-Virginica\n \n :Summary Statistics:\n\n ============== ==== ==== ======= ===== ====================\n Min Max Mean SD Class Correlation\n ============== ==== ==== ======= ===== ====================\n sepal length: 4.3 7.9 5.84 0.83 0.7826\n sepal width: 2.0 4.4 3.05 0.43 -0.4194\n petal length: 1.0 6.9 3.76 1.76 0.9490 (high!)\n petal width: 0.1 2.5 1.20 0.76 0.9565 (high!)\n ============== ==== ==== ======= ===== ====================\n\n :Missing Attribute Values: None\n :Class Distribution: 33.3% for each of 3 classes.\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature. Fisher\'s paper is a classic in the field and\nis referenced frequently to this day. (See Duda & Hart, for example.) The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant. One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n Mathematical Statistics" (John Wiley, NY, 1950).\n - Duda, R.O., & Hart, P.E. 
(1973) Pattern Classification and Scene Analysis.\n (Q327.D83) John Wiley & Sons. ISBN 0-471-22361-1. See page 218.\n - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n Structure and Classification Rule for Recognition in Partially Exposed\n Environments". IEEE Transactions on Pattern Analysis and Machine\n Intelligence, Vol. PAMI-2, No. 1, 67-71.\n - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule". IEEE Transactions\n on Information Theory, May 1972, 431-433.\n - See also: 1988 MLC Proceedings, 54-64. Cheeseman et al"s AUTOCLASS II\n conceptual clustering system finds 3 classes in the data.\n - Many, many more ...', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], 'filename': 'iris.csv', 'data_module': 'sklearn.datasets.data'}
In [133]:reduced
Out[133]:array([[-2.68412563, 0.31939725, -0.02791483], [-2.71414169, -0.17700123, -0.21046427], [-2.88899057, -0.14494943, 0.01790026], [-2.74534286, -0.31829898, 0.03155937], [-2.72871654, 0.32675451, 0.09007924], [-2.28085963, 0.74133045, 0.16867766], [-2.82053775, -0.08946138, 0.25789216], [-2.62614497, 0.16338496, -0.02187932], [-2.88638273, -0.57831175, 0.02075957], [-2.6727558 , -0.11377425, -0.19763272], [-2.50694709, 0.6450689 , -0.07531801], [-2.61275523, 0.01472994, 0.10215026], [-2.78610927, -0.235112 , -0.20684443], [-3.22380374, -0.51139459, 0.06129967], [-2.64475039, 1.17876464, -0.15162752], [-2.38603903, 1.33806233, 0.2777769 ], [-2.62352788, 0.81067951, 0.13818323], [-2.64829671, 0.31184914, 0.02666832], [-2.19982032, 0.87283904, -0.12030552], [-2.5879864 , 0.51356031, 0.21366517], [-2.31025622, 0.39134594, -0.23944404], [-2.54370523, 0.43299606, 0.20845723], [-3.21593942, 0.13346807, 0.29239675], [-2.30273318, 0.09870885, 0.03912326], [-2.35575405, -0.03728186, 0.12502108], [-2.50666891, -0.14601688, -0.25342004], [-2.46882007, 0.13095149, 0.09491058], [-2.56231991, 0.36771886, -0.07849421], [-2.63953472, 0.31203998, -0.1459089 ], [-2.63198939, -0.19696122, 0.04077108], [-2.58739848, -0.20431849, -0.07722299], [-2.4099325 , 0.41092426, -0.14552497], [-2.64886233, 0.81336382, 0.22566915], [-2.59873675, 1.09314576, 0.15781081], [-2.63692688, -0.12132235, -0.14304958], [-2.86624165, 0.06936447, -0.16433231], [-2.62523805, 0.59937002, -0.26835038], [-2.80068412, 0.26864374, 0.09369908], [-2.98050204, -0.48795834, 0.07292705], [-2.59000631, 0.22904384, -0.0800823 ], [-2.77010243, 0.26352753, 0.07724769], [-2.84936871, -0.94096057, -0.34923038], [-2.99740655, -0.34192606, 0.19250921], [-2.40561449, 0.18887143, 0.26386795], [-2.20948924, 0.43666314, 0.29874275], [-2.71445143, -0.2502082 , -0.09767814], [-2.53814826, 0.50377114, 0.16670564], [-2.83946217, -0.22794557, 0.08372685], [-2.54308575, 0.57941002, -0.01711502], [-2.70335978, 0.10770608, 
-0.08929401], [ 1.28482569, 0.68516047, -0.40656803], [ 0.93248853, 0.31833364, -0.01801419], [ 1.46430232, 0.50426282, -0.33832576], [ 0.18331772, -0.82795901, -0.17959139], [ 1.08810326, 0.07459068, -0.3077579 ], [ 0.64166908, -0.41824687, 0.04107609], [ 1.09506066, 0.28346827, 0.16981024], [-0.74912267, -1.00489096, 0.01230292], [ 1.04413183, 0.2283619 , -0.41533608], [-0.0087454 , -0.72308191, 0.28114143], [-0.50784088, -1.26597119, -0.26981718], [ 0.51169856, -0.10398124, 0.13054775], [ 0.26497651, -0.55003646, -0.69414683], [ 0.98493451, -0.12481785, -0.06211441], [-0.17392537, -0.25485421, 0.09045769], [ 0.92786078, 0.46717949, -0.31462098], [ 0.66028376, -0.35296967, 0.32802753], [ 0.23610499, -0.33361077, -0.27116184], [ 0.94473373, -0.54314555, -0.49951905], [ 0.04522698, -0.58383438, -0.2350021 ], [ 1.11628318, -0.08461685, 0.45962099], [ 0.35788842, -0.06892503, -0.22985389], [ 1.29818388, -0.32778731, -0.34785435], [ 0.92172892, -0.18273779, -0.23107178], [ 0.71485333, 0.14905594, -0.32180094], [ 0.90017437, 0.32850447, -0.31620907], [ 1.33202444, 0.24444088, -0.52170278], [ 1.55780216, 0.26749545, -0.16492098], [ 0.81329065, -0.1633503 , 0.0354245 ], [-0.30558378, -0.36826219, -0.31849158], [-0.06812649, -0.70517213, -0.24421381], [-0.18962247, -0.68028676, -0.30642056], [ 0.13642871, -0.31403244, -0.17724277], [ 1.38002644, -0.42095429, 0.01616713], [ 0.58800644, -0.48428742, 0.4444335 ], [ 0.80685831, 0.19418231, 0.38896306], [ 1.22069088, 0.40761959, -0.23716701], [ 0.81509524, -0.37203706, -0.61472084], [ 0.24595768, -0.2685244 , 0.18836681], [ 0.16641322, -0.68192672, -0.06000923], [ 0.46480029, -0.67071154, -0.02430686], [ 0.8908152 , -0.03446444, -0.00994693], [ 0.23054802, -0.40438585, -0.22941024], [-0.70453176, -1.01224823, -0.10569115], [ 0.35698149, -0.50491009, 0.01661717], [ 0.33193448, -0.21265468, 0.08320429], [ 0.37621565, -0.29321893, 0.07799635], [ 0.64257601, 0.01773819, -0.20539497], [-0.90646986, -0.75609337, -0.01259965], [ 
0.29900084, -0.34889781, 0.01058166], [ 2.53119273, -0.00984911, 0.76016543], [ 1.41523588, -0.57491635, 0.29632253], [ 2.61667602, 0.34390315, -0.11078788], [ 1.97153105, -0.1797279 , 0.10842466], [ 2.35000592, -0.04026095, 0.28538956], [ 3.39703874, 0.55083667, -0.34843756], [ 0.52123224, -1.19275873, 0.5456593 ], [ 2.93258707, 0.3555 , -0.42023994], [ 2.32122882, -0.2438315 , -0.34830439], [ 2.91675097, 0.78279195, 0.42333542], [ 1.66177415, 0.24222841, 0.24244019], [ 1.80340195, -0.21563762, -0.03764817], [ 2.1655918 , 0.21627559, 0.03332664], [ 1.34616358, -0.77681835, 0.28190288], [ 1.58592822, -0.53964071, 0.62902933], [ 1.90445637, 0.11925069, 0.47963982], [ 1.94968906, 0.04194326, 0.04418617], [ 3.48705536, 1.17573933, 0.13389487], [ 3.79564542, 0.25732297, -0.51376776], [ 1.30079171, -0.76114964, -0.34499504], [ 2.42781791, 0.37819601, 0.21911932], [ 1.19900111, -0.60609153, 0.51185551], [ 3.49992004, 0.4606741 , -0.57318224], [ 1.38876613, -0.20439933, -0.06452276], [ 2.2754305 , 0.33499061, 0.28615009], [ 2.61409047, 0.56090136, -0.20553452], [ 1.25850816, -0.17970479, 0.0458477 ], [ 1.29113206, -0.11666865, 0.23125646], [ 2.12360872, -0.20972948, 0.15418002], [ 2.38800302, 0.4646398 , -0.44953019], [ 2.84167278, 0.37526917, -0.49889808], [ 3.23067366, 1.37416509, -0.11454821], [ 2.15943764, -0.21727758, 0.20876317], [ 1.44416124, -0.14341341, -0.15323389], [ 1.78129481, -0.49990168, -0.17287519], [ 3.07649993, 0.68808568, -0.33559229], [ 2.14424331, 0.1400642 , 0.73487894], [ 1.90509815, 0.04930053, 0.16218024], [ 1.16932634, -0.16499026, 0.28183584], [ 2.10761114, 0.37228787, 0.02729113], [ 2.31415471, 0.18365128, 0.32269375], [ 1.9222678 , 0.40920347, 0.1135866 ], [ 1.41523588, -0.57491635, 0.29632253], [ 2.56301338, 0.2778626 , 0.29256952], [ 2.41874618, 0.3047982 , 0.50448266], [ 1.94410979, 0.1875323 , 0.17782509], [ 1.52716661, -0.37531698, -0.12189817], [ 1.76434572, 0.07885885, 0.13048163], [ 1.90094161, 0.11662796, 0.72325156], [ 1.39018886, 
-0.28266094, 0.36290965]])
In [134]:iris_lda
Out[134]:array([[ 8.06179978e+00, 3.00420621e-01], [ 7.12868772e+00, -7.86660426e-01], [ 7.48982797e+00, -2.65384488e-01], [ 6.81320057e+00, -6.70631068e-01], [ 8.13230933e+00, 5.14462530e-01], [ 7.70194674e+00, 1.46172097e+00], [ 7.21261762e+00, 3.55836209e-01], [ 7.60529355e+00, -1.16338380e-02], [ 6.56055159e+00, -1.01516362e+00], [ 7.34305989e+00, -9.47319209e-01], [ 8.39738652e+00, 6.47363392e-01], [ 7.21929685e+00, -1.09646389e-01], [ 7.32679599e+00, -1.07298943e+00], [ 7.57247066e+00, -8.05464137e-01], [ 9.84984300e+00, 1.58593698e+00], [ 9.15823890e+00, 2.73759647e+00], [ 8.58243141e+00, 1.83448945e+00], [ 7.78075375e+00, 5.84339407e-01], [ 8.07835876e+00, 9.68580703e-01], [ 8.02097451e+00, 1.14050366e+00], [ 7.49680227e+00, -1.88377220e-01], [ 7.58648117e+00, 1.20797032e+00], [ 8.68104293e+00, 8.77590154e-01], [ 6.25140358e+00, 4.39696367e-01], [ 6.55893336e+00, -3.89222752e-01], [ 6.77138315e+00, -9.70634453e-01], [ 6.82308032e+00, 4.63011612e-01], [ 7.92461638e+00, 2.09638715e-01], [ 7.99129024e+00, 8.63787128e-02], [ 6.82946447e+00, -5.44960851e-01], [ 6.75895493e+00, -7.59002759e-01], [ 7.37495254e+00, 5.65844592e-01], [ 9.12634625e+00, 1.22443267e+00], [ 9.46768199e+00, 1.82522635e+00], [ 7.06201386e+00, -6.63400423e-01], [ 7.95876243e+00, -1.64961722e-01], [ 8.61367201e+00, 4.03253602e-01], [ 8.33041759e+00, 2.28133530e-01], [ 6.93412007e+00, -7.05519379e-01], [ 7.68823131e+00, -9.22362309e-03], [ 7.91793715e+00, 6.75121313e-01], [ 5.66188065e+00, -1.93435524e+00], [ 7.24101468e+00, -2.72615132e-01], [ 6.41443556e+00, 1.24730131e+00], [ 6.85944381e+00, 1.05165396e+00], [ 6.76470393e+00, -5.05151855e-01], [ 8.08189937e+00, 7.63392750e-01], [ 7.18676904e+00, -3.60986823e-01], [ 8.31444876e+00, 6.44953177e-01], [ 7.67196741e+00, -1.34893840e-01], [-1.45927545e+00, 2.85437643e-02], [-1.79770574e+00, 4.84385502e-01], [-2.41694888e+00, -9.27840307e-02], [-2.26247349e+00, -1.58725251e+00], [-2.54867836e+00, -4.72204898e-01], [-2.42996725e+00, 
-9.66132066e-01], [-2.44848456e+00, 7.95961954e-01], [-2.22666513e-01, -1.58467318e+00], [-1.75020123e+00, -8.21180130e-01], [-1.95842242e+00, -3.51563753e-01], [-1.19376031e+00, -2.63445570e+00], [-1.85892567e+00, 3.19006544e-01], [-1.15809388e+00, -2.64340991e+00], [-2.66605725e+00, -6.42504540e-01], [-3.78367218e-01, 8.66389312e-02], [-1.20117255e+00, 8.44373592e-02], [-2.76810246e+00, 3.21995363e-02], [-7.76854039e-01, -1.65916185e+00], [-3.49805433e+00, -1.68495616e+00], [-1.09042788e+00, -1.62658350e+00], [-3.71589615e+00, 1.04451442e+00], [-9.97610366e-01, -4.90530602e-01], [-3.83525931e+00, -1.40595806e+00], [-2.25741249e+00, -1.42679423e+00], [-1.25571326e+00, -5.46424197e-01], [-1.43755762e+00, -1.34424979e-01], [-2.45906137e+00, -9.35277280e-01], [-3.51848495e+00, 1.60588866e-01], [-2.58979871e+00, -1.74611728e-01], [ 3.07487884e-01, -1.31887146e+00], [-1.10669179e+00, -1.75225371e+00], [-6.05524589e-01, -1.94298038e+00], [-8.98703769e-01, -9.04940034e-01], [-4.49846635e+00, -8.82749915e-01], [-2.93397799e+00, 2.73791065e-02], [-2.10360821e+00, 1.19156767e+00], [-2.14258208e+00, 8.87797815e-02], [-2.47945603e+00, -1.94073927e+00], [-1.32552574e+00, -1.62869550e-01], [-1.95557887e+00, -1.15434826e+00], [-2.40157020e+00, -1.59458341e+00], [-2.29248878e+00, -3.32860296e-01], [-1.27227224e+00, -1.21458428e+00], [-2.93176055e-01, -1.79871509e+00], [-2.00598883e+00, -9.05418042e-01], [-1.18166311e+00, -5.37570242e-01], [-1.61615645e+00, -4.70103580e-01], [-1.42158879e+00, -5.51244626e-01], [ 4.75973788e-01, -7.99905482e-01], [-1.54948259e+00, -5.93363582e-01], [-7.83947399e+00, 2.13973345e+00], [-5.50747997e+00, -3.58139892e-02], [-6.29200850e+00, 4.67175777e-01], [-5.60545633e+00, -3.40738058e-01], [-6.85055995e+00, 8.29825394e-01], [-7.41816784e+00, -1.73117995e-01], [-4.67799541e+00, -4.99095015e-01], [-6.31692685e+00, -9.68980756e-01], [-6.32773684e+00, -1.38328993e+00], [-6.85281335e+00, 2.71758963e+00], [-4.44072512e+00, 1.34723692e+00], 
[-5.45009572e+00, -2.07736942e-01], [-5.66033713e+00, 8.32713617e-01], [-5.95823722e+00, -9.40175447e-02], [-6.75926282e+00, 1.60023206e+00], [-5.80704331e+00, 2.01019882e+00], [-5.06601233e+00, -2.62733839e-02], [-6.60881882e+00, 1.75163587e+00], [-9.17147486e+00, -7.48255067e-01], [-4.76453569e+00, -2.15573720e+00], [-6.27283915e+00, 1.64948141e+00], [-5.36071189e+00, 6.46120732e-01], [-7.58119982e+00, -9.80722934e-01], [-4.37150279e+00, -1.21297458e-01], [-5.72317531e+00, 1.29327553e+00], [-5.27915920e+00, -4.24582377e-02], [-4.08087208e+00, 1.85936572e-01], [-4.07703640e+00, 5.23238483e-01], [-6.51910397e+00, 2.96976389e-01], [-4.58371942e+00, -8.56815813e-01], [-6.22824009e+00, -7.12719638e-01], [-5.22048773e+00, 1.46819509e+00], [-6.80015000e+00, 5.80895175e-01], [-3.81515972e+00, -9.42985932e-01], [-5.10748966e+00, -2.13059000e+00], [-6.79671631e+00, 8.63090395e-01], [-6.52449599e+00, 2.44503527e+00], [-4.99550279e+00, 1.87768525e-01], [-3.93985300e+00, 6.14020389e-01], [-5.20383090e+00, 1.14476808e+00], [-6.65308685e+00, 1.80531976e+00], [-5.10555946e+00, 1.99218201e+00], [-5.50747997e+00, -3.58139892e-02], [-6.79601924e+00, 1.46068695e+00], [-6.84735943e+00, 2.42895067e+00], [-5.64500346e+00, 1.67771734e+00], [-5.17956460e+00, -3.63475041e-01], [-4.96774090e+00, 8.21140550e-01], [-5.88614539e+00, 2.34509051e+00], [-4.68315426e+00, 3.32033811e-01]])
In [ ]:# scaling: 통계적인 이점 = 같은 스케일로 값을 비교할 수 있다는 이점 # 그리고, ML 모델에 적용할 수 있는 이점이 있다
In [135]:from sklearn.preprocessing import MinMaxScaler
In [137]:data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
In [146]:df = pd.DataFrame(data) df
Out[146]:010123-1.0 2 -0.5 6 0.0 10 1.0 18 In [144]:pd.DataFrame(scaler.transform(data))
Out[144]:0101230.00 0.00 0.25 0.25 0.50 0.50 1.00 1.00 In [138]:scaler = MinMaxScaler()
In [139]:scaler.fit(data)
Out[139]:MinMaxScaler()
In [140]:scaler.transform(data) # 각각의 scale이 어떤 기준으로 할당되는가? -> DataFrame 기준, 각 column 단위로 scaling # 왜 [1,18]이 [1,1]로 저장되는가? -> scaling 기준이 서로 다르기 때문
Out[140]:array([[0. , 0. ], [0.25, 0.25], [0.5 , 0.5 ], [1. , 1. ]])
In [142]:scaler.transform([[4,5]])
Out[142]:array([[2.5 , 0.1875]])
In [3]:import pandas as pd from scipy import stats
In [82]:df = pd.read_excel('./FTA무역통계_20230109000036.xls', header=4, index_col=1)
C:\ProgramData\Anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
In [5]:df
Out[5]:기간수출건수수출금액수입건수수입금액무역수지FTA명RCEP한-EFTA한-EU한-뉴질랜드한-미국한-베트남한-싱가포르한-아세안한-영국한-인도한-중국한-중미한-칠레한-캐나다한-콜롬비아한-터키한-페루한-호주2022 6,474,383 287,564,087 11,944,545 292,113,146 -4,549,059 2022 26,912 1,794,418 75,489 4,810,598 -3,016,180 2022 882,224 67,684,624 4,734,145 66,716,800 967,825 2022 36,305 3,139,637 455,563 1,743,951 1,395,687 2022 2,186,357 100,446,018 18,531,680 74,972,585 25,473,433 2022 582,908 56,212,924 860,372 24,850,108 31,362,815 2022 749,118 18,696,289 151,934 9,315,582 9,380,706 2022 2,775,570 115,599,623 1,839,987 75,744,022 39,855,602 2022 108,368 5,862,160 386,507 5,374,130 488,030 2022 193,395 17,391,372 182,655 8,403,685 8,987,687 2022 1,325,620 144,611,224 6,775,116 142,753,918 1,857,306 2022 28,606 1,388,730 10,003 933,363 455,367 2022 30,137 1,425,386 16,553 6,264,483 -4,839,097 2022 212,405 7,224,014 329,449 7,966,998 -742,985 2022 14,045 862,924 11,286 636,784 226,140 2022 82,688 7,059,423 46,380 1,287,251 5,772,172 2022 13,929 721,030 10,024 2,734,129 -2,013,099 2022 191,729 17,396,124 560,089 41,235,660 -23,839,536 In [6]:df['수출건수'][0]
Out[6]:' 6,474,383'
# In [83]: convert the count columns from formatted text to integers.
# The raw cells look like ' 6,474,383' (leading space, thousands separators).
# astype(str) keeps this cell re-runnable: on a second run the values are
# already integers, and calling .replace on them would raise AttributeError.
for count_col in ['수출건수', '수입건수']:
    df[count_col] = (
        df[count_col]
        .astype(str)
        .str.replace(r'[ ,]', '', regex=True)
        .astype('int64')
    )
In [8]:df
Out[8]:기간수출건수수출금액수입건수수입금액무역수지FTA명RCEP한-EFTA한-EU한-뉴질랜드한-미국한-베트남한-싱가포르한-아세안한-영국한-인도한-중국한-중미한-칠레한-캐나다한-콜롬비아한-터키한-페루한-호주2022 6474383 287,564,087 11944545 292,113,146 -4,549,059 2022 26912 1,794,418 75489 4,810,598 -3,016,180 2022 882224 67,684,624 4734145 66,716,800 967,825 2022 36305 3,139,637 455563 1,743,951 1,395,687 2022 2186357 100,446,018 18531680 74,972,585 25,473,433 2022 582908 56,212,924 860372 24,850,108 31,362,815 2022 749118 18,696,289 151934 9,315,582 9,380,706 2022 2775570 115,599,623 1839987 75,744,022 39,855,602 2022 108368 5,862,160 386507 5,374,130 488,030 2022 193395 17,391,372 182655 8,403,685 8,987,687 2022 1325620 144,611,224 6775116 142,753,918 1,857,306 2022 28606 1,388,730 10003 933,363 455,367 2022 30137 1,425,386 16553 6,264,483 -4,839,097 2022 212405 7,224,014 329449 7,966,998 -742,985 2022 14045 862,924 11286 636,784 226,140 2022 82688 7,059,423 46380 1,287,251 5,772,172 2022 13929 721,030 10024 2,734,129 -2,013,099 2022 191729 17,396,124 560089 41,235,660 -23,839,536 In [9]:df['수출건수'], df['수입건수']
Out[9]:(FTA명 RCEP 6474383 한-EFTA 26912 한-EU 882224 한-뉴질랜드 36305 한-미국 2186357 한-베트남 582908 한-싱가포르 749118 한-아세안 2775570 한-영국 108368 한-인도 193395 한-중국 1325620 한-중미 28606 한-칠레 30137 한-캐나다 212405 한-콜롬비아 14045 한-터키 82688 한-페루 13929 한-호주 191729 Name: 수출건수, dtype: int64, FTA명 RCEP 11944545 한-EFTA 75489 한-EU 4734145 한-뉴질랜드 455563 한-미국 18531680 한-베트남 860372 한-싱가포르 151934 한-아세안 1839987 한-영국 386507 한-인도 182655 한-중국 6775116 한-중미 10003 한-칠레 16553 한-캐나다 329449 한-콜롬비아 11286 한-터키 46380 한-페루 10024 한-호주 560089 Name: 수입건수, dtype: int64)
In [10]:# ttest_ind = T-test = Student's T-test stats.ttest_ind(df['수출건수'],df['수입건수'])
Out[10]:Ttest_indResult(statistic=-1.3740039439389153, pvalue=0.17843151065870033)
0113 추가 실습
- 위 T-test를 다시 실행한다
- 단, 데이터 전처리와 transformation을 시도해 본다
In [31]:import math import numpy as np import seaborn as sns
# In [37]: check the distribution of the export counts.
# distplot is deprecated in seaborn; histplot with kde=True and stat='density'
# draws the equivalent density-scaled histogram with a KDE overlay.
sns.histplot(df['수출건수'], kde=True, stat='density')
Out[37]:<AxesSubplot:xlabel='수출건수', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 52636 (\N{HANGUL SYLLABLE CUL}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [ ]:np.exp(df['수출건수']) # 지수 변환
In [ ]:np.log2(df['수출건수']) # 로그 변환
In [30]:df['수출건수'].apply(lambda x: math.log2(x))
Out[30]:FTA명 RCEP 4.499929 한-EFTA 3.879310 한-EU 4.303838 한-뉴질랜드 3.921044 한-미국 4.396440 한-베트남 4.259492 한-싱가포르 4.286499 한-아세안 4.419832 한-영국 4.063984 한-인도 4.134319 한-중국 4.346123 한-중미 3.887918 한-칠레 3.895230 한-캐나다 4.145389 한-콜롬비아 3.784270 한-터키 4.029929 한-페루 3.783017 한-호주 4.133293 Name: 수출건수, dtype: float64
In [ ]:In [ ]:In [ ]:In [11]:stats.ttest_ind(df['수출건수'],df['수입건수'])
Out[11]:Ttest_indResult(statistic=-1.3740039439389153, pvalue=0.17843151065870033)
In [12]:stats.shapiro(df['수출건수'])
Out[12]:ShapiroResult(statistic=0.6007808446884155, pvalue=6.888485586387105e-06)
In [13]:stats.shapiro(df['수입건수'])
Out[13]:ShapiroResult(statistic=0.5887835025787354, pvalue=5.249044534139102e-06)
In [20]:import seaborn as sns import math import numpy as np
# In [49]: distribution of the log2-transformed import counts.
# np.log2 transforms the whole column at once (no per-element lambda), and
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(np.log2(df['수입건수']), kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[49]:<AxesSubplot:xlabel='수입건수', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 51077 (\N{HANGUL SYLLABLE IB}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
# In [21]: distribution of the log2-transformed import counts.
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(np.log2(df['수입건수']), kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[21]:<AxesSubplot:xlabel='수입건수', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 51077 (\N{HANGUL SYLLABLE IB}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [50]:stats.shapiro(df['수출건수'].apply(lambda x: math.log2(x)))
Out[50]:ShapiroResult(statistic=0.944760262966156, pvalue=0.3488025963306427)
In [51]:stats.shapiro(np.log2(df['수입건수']))
Out[51]:ShapiroResult(statistic=0.9464555978775024, pvalue=0.37215086817741394)
# In [52]: apply the log2 transformation to both count columns.
# np.log2 is used for both columns so they are transformed the same, vectorized way.
# NOTE: the columns are overwritten in place, so running this cell twice takes
# the log of the log; reload the data before re-running.
for count_col in ['수출건수', '수입건수']:
    df[count_col] = np.log2(df[count_col])
In [53]:# KS test (two-sample Kolmogorov-Smirnov): checks whether the two groups were drawn from the same distribution; it is not an equal-variance test # Null hypothesis: both groups were drawn from the same distribution # Alternative hypothesis: the groups were drawn from different distributions stats.ks_2samp(df['수출건수'], df['수입건수'])
Out[53]:KstestResult(statistic=0.16666666666666666, pvalue=0.9715397823325015)
In [55]:# levene test: 두 그룹이 등분산성인지 확인 # 귀무가설: 두 그룹이 등분산성이다 # 대립가설: 두 그룹이 등분산성이 아니다 = 이분산성이다 stats.levene(df['수출건수'], df['수입건수'])
Out[55]:LeveneResult(statistic=0.7822644136954173, pvalue=0.3826615588571587)
In [56]:# T-test (등분산성) # Student’s T-test의 귀무가설: 두 그룹은 같은 모평균을 가진다 stats.ttest_ind(df['수출건수'], df['수입건수'], equal_var=True)
Out[56]:Ttest_indResult(statistic=-0.6030815594484242, pvalue=0.5504565242174382)
In [84]:# 정규성을 만족하지 않을 때, 정규성을 가지도록 변환할 것인가 비모수적인 방법을 이용할 것인가 # 변환의 정도가 크지 않다면 전자, 크다면 후자를 이용한다 # 변환의 정도는 주관적으로 판단한다 stats.mannwhitneyu(df['수출건수'], df['수입건수'])
Out[84]:MannwhitneyuResult(statistic=151.0, pvalue=0.7397342431142654)
T-test 절차
- 전처리
- 정규성 검정 (Shapiro-Wilk test, p < 0.05일 경우 정규성이 없는 것으로 볼 수 있다)
- 정규성이 없을 경우 -> transformation을 시도한다 (해당 값을 X로 가정한다)
- X에 대해 등분산성 검정 (Levene test, p < 0.05일 경우 이분산성으로 볼 수 있다)
- X에 대해 적절한 T-test 기법 선택 후 시행 (p < 0.05일 경우 두 그룹의 모평균이 다르다고 볼 수 있다)
- scale이 지나치게 크거나 작을 경우, 조절해 주는 것을 권장
실습
- 아래 데이터를 이용해 통계분석을 해 본다
- 물건금액 column을 이용
- 특정 구역과 다른 구역 사이의 관계 또는 특성을 알아본다
- 통계분석에 사용되는 데이터는 모두 전처리를 한다
- 샘플 추출이 필요한 경우, 무작위로 추출한다
In [89]:import random
In [ ]:# 랜덤 샘플링을 위한 기능 # np.random.randint() # random 패키지
In [85]:df = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')
C:\Users\user\AppData\Local\Temp\ipykernel_13608\1027994529.py:1: DtypeWarning: Columns (7,19,20) have mixed types. Specify dtype option on import or set low_memory=False. df = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')
In [88]:df.describe()['물건금액(만원)']
Out[88]:count 6.400000e+05 mean 6.121905e+04 std 8.540087e+04 min 1.700000e+03 25% 2.300000e+04 50% 3.650000e+04 75% 7.300000e+04 max 1.108778e+07 Name: 물건금액(만원), dtype: float64
# In [90]: distribution of the raw transaction prices.
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(df['물건금액(만원)'], kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[90]:<AxesSubplot:xlabel='물건금액(만원)', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47932 (\N{HANGUL SYLLABLE MUL}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47564 (\N{HANGUL SYLLABLE MAN}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50896 (\N{HANGUL SYLLABLE WEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
# In [91]: distribution of the log2-transformed transaction prices.
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(np.log2(df['물건금액(만원)']), kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[91]:<AxesSubplot:xlabel='물건금액(만원)', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47932 (\N{HANGUL SYLLABLE MUL}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47564 (\N{HANGUL SYLLABLE MAN}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50896 (\N{HANGUL SYLLABLE WEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [92]:df['물건금액(만원)'].sort_values()
Out[92]:469341 1700 537486 2000 450513 2000 452996 2000 586391 2000 ... 213608 11087780 191607 11087780 213454 11087780 191588 11087780 191967 11087780 Name: 물건금액(만원), Length: 640000, dtype: int64
In [94]:df['물건금액(만원)'].sort_values()[-10:]
Out[94]:224778 3000000 226055 3000000 213821 11087780 191319 11087780 213661 11087780 213608 11087780 191607 11087780 213454 11087780 191588 11087780 191967 11087780 Name: 물건금액(만원), dtype: int64
In [95]:stats.shapiro(np.log2(df['물건금액(만원)']))
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_morestats.py:1800: UserWarning: p-value may not be accurate for N > 5000. warnings.warn("p-value may not be accurate for N > 5000.")
Out[95]:ShapiroResult(statistic=0.9840511679649353, pvalue=0.0)
In [140]:# sample 개수가 많아 정규성을 만족한다고 가정할지 # log2 transformation을 하는 것으로 결정할지 자의적으로 판단 df['물건금액(만원)'] = np.log2(df['물건금액(만원)'])
In [141]:df.shape
Out[141]:(640000, 21)
# In [123]-[125]: draw two independent random samples of 200 transactions each.
# DataFrame.sample selects whole rows in one call, so every column keeps its
# dtype. Building the frame column-by-column from row Series and transposing
# it leaves every column as object dtype and is slow for 200 insertions.
# sample() also draws without replacement, so no row repeats within a sample.
# reset_index(drop=True) gives the same 0..199 row labels the loop produced.
data1 = df.sample(n=200).reset_index(drop=True)
data2 = df.sample(n=200).reset_index(drop=True)
In [142]:stats.shapiro(data1['물건금액(만원)'])
Out[142]:ShapiroResult(statistic=0.9806092977523804, pvalue=0.007261964492499828)
# In [143]: distribution of the sampled (log2-transformed) prices.
# The cast to float guards against the sample column being stored as object dtype.
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(data1['물건금액(만원)'].astype(float), kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[143]:<AxesSubplot:xlabel='물건금액(만원)', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47932 (\N{HANGUL SYLLABLE MUL}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 47564 (\N{HANGUL SYLLABLE MAN}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50896 (\N{HANGUL SYLLABLE WEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [128]:stats.levene(data1['물건금액(만원)'], data2['물건금액(만원)'])
Out[128]:LeveneResult(statistic=0.038709411342177195, pvalue=0.8441258974314072)
# In [135]: independent two-sample t-test on the prices of the two random samples.
# Both columns are cast to float before being handed to scipy.
stats.ttest_ind(
    data1['물건금액(만원)'].astype(float),
    data2['물건금액(만원)'].astype(float),
)
Out[135]:Ttest_indResult(statistic=-0.3327522991505876, pvalue=0.739496466909871)
In [ ]:In [86]:df
Out[86]:접수연도자치구코드자치구명법정동코드법정동명지번구분지번구분명본번부번건물명...물건금액(만원)건물면적(㎡)토지면적(㎡)층권리구분취소일건축년도건물용도신고구분신고한 개업공인중개사 시군구명01234...6399956399966399976399986399992023 11710 송파구 10600 삼전동 1.0 대지 43.0 3.0 더트라이앵글 ... 27000 16.59 12.15 3.0 NaN NaN 2021.0 연립다세대 중개거래 서울 송파구 2023 11140 중구 16100 저동2가 1.0 대지 7.0 2.0 제이매크로 타워 ... 20300 19.35 31.04 9.0 NaN NaN 2018.0 오피스텔 중개거래 서울 중구 2023 11260 중랑구 10300 중화동 1.0 대지 295.0 38.0 대일빌라 ... 13500 44.00 27.58 2.0 NaN NaN 1994.0 연립다세대 중개거래 서울 중랑구 2023 11740 강동구 10900 천호동 1.0 대지 425.0 3.0 천호역 한강 푸르지오시티 ... 18800 25.24 35.91 28.0 NaN NaN 2015.0 오피스텔 중개거래 서울 강동구 2023 11380 은평구 10800 역촌동 1.0 대지 80.0 2.0 임창에버빌(80-2) ... 22500 40.48 15.96 6.0 NaN NaN 2011.0 연립다세대 중개거래 서울 은평구 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 2019 11320 도봉구 10500 쌍문동 1.0 대지 56.0 0.0 삼익세라믹 ... 22000 42.93 NaN 8.0 NaN NaN 1988.0 아파트 NaN NaN 2019 11290 성북구 13400 길음동 1.0 대지 1278.0 0.0 길음동동부센트레빌(1278-0) ... 49800 60.00 NaN 8.0 NaN NaN 2003.0 아파트 NaN NaN 2019 11320 도봉구 10600 방학동 1.0 대지 271.0 1.0 신동아아파트1 ... 25000 53.16 NaN 15.0 NaN NaN 1986.0 아파트 NaN NaN 2019 11305 강북구 10300 수유동 1.0 대지 169.0 5.0 드림하이빌(169-5) ... 19500 43.88 27.20 5.0 NaN NaN 2015.0 연립다세대 NaN NaN 2019 11380 은평구 10900 신사동 1.0 대지 261.0 20.0 성락타운(261-20) ... 20250 49.35 45.72 1.0 NaN NaN 1987.0 연립다세대 NaN NaN 640000 rows × 21 columns
In [77]:math.log2(10)
Out[77]:3.321928094887362
In [78]:math.log2(1000000000000)
Out[78]:39.86313713864835
In [79]:math.log2(10000000000000)
Out[79]:43.18506523353571
In [ ]:In [ ]:In [ ]:p-value 표기법
- p < 0.05 == *
- p < 0.01 == **
- p < 0.001 == ***
- p < 0.0001 == ****
- p >= 0.05 == ns
In [188]:# 정규성 검정 # 귀무가설: 정규성을 만족하는 모집단에서 나왔다 # p < 0.05 이므로 정규성을 만족하지 않는다 라고 볼 수 있음 stats.shapiro(df['수출건수'])
Out[188]:ShapiroResult(statistic=0.6007808446884155, pvalue=6.888485586387105e-06)
In [189]:import seaborn as sns
# In [190]: distribution of the export counts.
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(df['수출건수'], kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[190]:<AxesSubplot:xlabel='수출건수', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 52636 (\N{HANGUL SYLLABLE CUL}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44148 (\N{HANGUL SYLLABLE GEON}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [200]:# 귀무가설: 두 그룹의 차이가 나지 않는다 # p >= 0.05 이기 때문에 두 그룹 간의 차이가 나지 않는다고 볼 수 있음 stats.wilcoxon(df['수출건수'],df['수입건수'])
Out[200]:WilcoxonResult(statistic=48.0, pvalue=0.1083831787109375)
In [201]:# KS test (two-sample Kolmogorov-Smirnov) # Null hypothesis: both samples were drawn from the same distribution (this is not an equal-variance check; Levene's test is the one that checks variances) # p >= 0.05, so the null hypothesis of a common distribution is not rejected stats.ks_2samp(df['수출건수'],df['수입건수'])
Out[201]:KstestResult(statistic=0.16666666666666666, pvalue=0.9715397823325015)
In [202]:# Welch’s T-test # Student's T-test에서 equal_var 옵션을 False 로 설정하면 됩니다 stats.ttest_ind(df['수출건수'],df['수입건수'], equal_var=False)
Out[202]:Ttest_indResult(statistic=-1.3740039439389151, pvalue=0.18436372922684297)
실습
- 아래의 데이터에서 각각 T-test를 진행한다
- T-test를 진행할 때, 결과가 특정 주장을 할 수 있어야 합니다
- FTA 데이터 예시) A/B 그룹의 수출금액이 서로 다른지 비교할 경우, 그 원인을 찾아 주장할 수 있다
In [236]:df1 = pd.read_excel('./FTA무역통계_20230109000036.xls', index_col=1, header=4)
C:\ProgramData\Anaconda3\lib\site-packages\openpyxl\styles\stylesheet.py:226: UserWarning: Workbook contains no default style, apply openpyxl's default warn("Workbook contains no default style, apply openpyxl's default")
In [206]:df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')
C:\Users\user\AppData\Local\Temp\ipykernel_18480\496141101.py:1: DtypeWarning: Columns (7,19,20) have mixed types. Specify dtype option on import or set low_memory=False. df2 = pd.read_csv('./서울시 부동산 실거래가 정보.csv', encoding='cp949')
In [237]:유럽 = ['한-EFTA','한-EU','한-영국'] 아시아 = ['RCEP','한-베트남','한-싱가포르','한-아세안','한-인도','한-중국'] 아메리카 = ['한-미국','한-중미','한-칠레','한-캐나다','한-콜롬비아','한-페루'] 서아시아 = ['한-터키'] 오세아니아 = ['한-호주','한-뉴질랜드']
# In [238]: convert the import-amount column from formatted text to integers.
# The values are strings with spaces and thousands separators.
# astype(str) keeps this cell re-runnable: once the column holds integers,
# calling .replace on each element would raise AttributeError.
df1['수입금액'] = (
    df1['수입금액']
    .astype(str)
    .str.replace(r'[ ,]', '', regex=True)
    .astype('int64')
)
In [239]:df1.loc[유럽]
Out[239]:기간수출건수수출금액수입건수수입금액무역수지FTA명한-EFTA한-EU한-영국2022 26,912 1,794,418 75,489 4810598 -3,016,180 2022 882,224 67,684,624 4,734,145 66716800 967,825 2022 108,368 5,862,160 386,507 5374130 488,030 In [240]:In [ ]:# 수입금액을 아시아-아메리카 분류로 비교
In [241]:stats.shapiro(df1['수입금액'])
Out[241]:ShapiroResult(statistic=0.6233973503112793, pvalue=1.1650753549474757e-05)
In [242]:import matplotlib.pyplot as plt stats.probplot(df1['수입금액'], plot=plt)
Out[242]:((array([-1.77709673, -1.33087857, -1.05345661, -0.83977496, -0.6589352 , -0.49749478, -0.34812942, -0.20618578, -0.06829777, 0.06829777, 0.20618578, 0.34812942, 0.49749478, 0.6589352 , 0.83977496, 1.05345661, 1.33087857, 1.77709673]), array([ 636784, 933363, 1287251, 1743951, 2734129, 4810598, 5374130, 6264483, 7966998, 8403685, 9315582, 24850108, 41235660, 66716800, 74972585, 75744022, 142753918, 292113146], dtype=int64)), (60201229.04824289, 42658732.94444444, 0.7760322824492758))
# In [243]: distribution of the import amounts.
# histplot replaces the deprecated distplot (density histogram + KDE overlay).
sns.histplot(df1['수입금액'], kde=True, stat='density')
C:\ProgramData\Anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
Out[243]:<AxesSubplot:xlabel='수입금액', ylabel='Density'>
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 49688 (\N{HANGUL SYLLABLE SU}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 51077 (\N{HANGUL SYLLABLE IB}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 44552 (\N{HANGUL SYLLABLE GEUM}) missing from current font. fig.canvas.print_figure(bytes_io, **kw) C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning: Glyph 50529 (\N{HANGUL SYLLABLE AEG}) missing from current font. fig.canvas.print_figure(bytes_io, **kw)
In [248]:stats.ks_2samp(df1.loc[아시아,'수입금액'],df1.loc[아메리카,'수입금액'])
Out[248]:KstestResult(statistic=0.8333333333333334, pvalue=0.025974025974025972)
In [249]:stats.wilcoxon(df1.loc[아시아,'수입금액'],df1.loc[아메리카,'수입금액']) # NOTE(review): scipy's wilcoxon is the signed-rank test for paired samples; the Asia and America lists hold different agreements, so the element-wise pairing here is arbitrary - confirm this is intended, or rely on the Mann-Whitney U test for independent groups
Out[249]:WilcoxonResult(statistic=0.0, pvalue=0.03125)
In [250]:stats.mannwhitneyu(df1.loc[아시아,'수입금액'],df1.loc[아메리카,'수입금액'])
Out[250]:MannwhitneyuResult(statistic=33.0, pvalue=0.015151515151515152)
In [252]:df1.loc[아시아,'수입금액'].mean() > df1.loc[아메리카,'수입금액'].mean()
Out[252]:True
In [270]:df2[['건물면적(㎡)','토지면적(㎡)']]
Out[270]:건물면적(㎡)토지면적(㎡)01234...63999563999663999763999863999916.59 12.150000 19.35 31.040000 44.00 27.580000 25.24 35.910000 40.48 15.960000 ... ... 42.93 35.520121 60.00 35.520121 53.16 35.520121 43.88 27.200000 49.35 45.720000 640000 rows × 2 columns
In [271]:stats.shapiro(df2['건물면적(㎡)'])
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_morestats.py:1800: UserWarning: p-value may not be accurate for N > 5000. warnings.warn("p-value may not be accurate for N > 5000.")
Out[271]:ShapiroResult(statistic=0.4947553873062134, pvalue=0.0)
In [272]:stats.shapiro(df2['토지면적(㎡)'])
C:\ProgramData\Anaconda3\lib\site-packages\scipy\stats\_morestats.py:1800: UserWarning: p-value may not be accurate for N > 5000. warnings.warn("p-value may not be accurate for N > 5000.")
Out[272]:ShapiroResult(statistic=0.3813157081604004, pvalue=0.0)
# In [262]: replace missing land areas with the column mean.
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is a chained in-place update, which recent pandas deprecates and
# which does not modify df2 when copy-on-write is enabled.
df2['토지면적(㎡)'] = df2['토지면적(㎡)'].fillna(df2['토지면적(㎡)'].mean())
In [264]:stats.ks_2samp(df2['건물면적(㎡)'], df2['토지면적(㎡)'])
Out[264]:KstestResult(statistic=0.49686874999999997, pvalue=0.0)
In [275]:stats.ttest_ind(df2['건물면적(㎡)'], df2['토지면적(㎡)'], equal_var=False)
Out[275]:Ttest_indResult(statistic=272.40307457631644, pvalue=0.0)
In [269]:df2['건물면적(㎡)'].mean() > df2['토지면적(㎡)'].mean()
Out[269]:True
In [ ]: