Pandas Interoperability
• 37 min read
# Code adapted from https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2
import pandas as pd

# Remote CSV that the rest of the notebook works on.
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0,
# where it raises TypeError. `on_bad_lines='skip'` (pandas >= 1.3) is the
# replacement and keeps the same behavior: malformed rows are dropped.
df = pd.read_csv(url, on_bad_lines='skip')
# Notebook display of the loaded frame.
df
import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# Show the column labels of the raw frame.
df.columns
# Ask scikit-learn to render estimators as diagrams in notebook output.
sklearn.set_config(display='diagram')
# Discard every row that has a missing value, then confirm none remain.
df = df.dropna()
df.isna().sum()
# 'specific' is the prediction target; 'Unnamed: 0' is excluded from the
# features as well (it appears to be a saved row index -- confirm).
y = df['specific']
X = df.drop(columns=['specific', 'Unnamed: 0'])
X
# Histogram grid of the feature columns; the returned axes are discarded.
_ = X.hist(layout=(5, 8), figsize=(30, 15))
import numpy as np
import sklearn
sklearn.set_config(display='diagram')
from sklearn.preprocessing import OrdinalEncoder
# Demo: ordinal-encode the *whole* frame. Every column, numeric ones
# included, is treated as categorical, and each distinct value is replaced
# by its position in that column's sorted category list (see the recorded
# `categories_` output further down).
ord_encoder = OrdinalEncoder()
# Fit on df and return the encoded array in one step (displayed by the notebook).
ord_encoder.fit_transform(df)
array([[ 0., 0., 17., ..., 24., 0., 28.], [ 1., 0., 48., ..., 29., 0., 60.], [ 2., 0., 64., ..., 40., 0., 67.], ..., [115., 27., 44., ..., 97., 0., 53.], [116., 27., 84., ..., 4., 0., 64.], [117., 27., 78., ..., 25., 0., 71.]])
ord_encoder.categories_
[array([ 4, 6, 7, 10, 11, 16, 18, 19, 22, 23, 34, 35, 40, 42, 43, 46, 47, 52, 54, 58, 64, 66, 67, 70, 71, 76, 78, 79, 82, 83, 88, 90, 91, 94, 95, 107, 112, 119, 124, 126, 127, 130, 131, 136, 138, 139, 142, 143, 148, 150, 151, 154, 155, 160, 162, 163, 166, 167, 172, 174, 175, 178, 179, 184, 186, 187, 190, 191, 196, 198, 199, 202, 203, 220, 222, 223, 226, 227, 232, 234, 235, 239, 244, 246, 247, 250, 251, 258, 259, 262, 263, 280, 282, 283, 292, 294, 295, 298, 310, 316, 318, 319, 322, 323, 328, 330, 331, 334, 335, 340, 342, 343, 346, 347, 354, 355, 358, 359]), array(['Anhui', 'Beijing', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong', 'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Ningxia', 'Qinghai', 'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Tianjin', 'Tibet', 'Xinjiang', 'Yunnan', 'Zhejiang'], dtype=object), array([ 72939., 91405., 96825., 103041., 107046., 107687., 119536., 127819., 133858., 137190., 142650., 144839., 147749., 148812., 160637., 179235., 181409., 195580., 207615., 217707., 223984., 228043., 237948., 248903., 251539., 254002., 262197., 265770., 271297., 271499., 281769., 295133., 319710., 331999., 337894., 340372., 353776., 354775., 359275., 365437., 367568., 369552., 370049., 379186., 391292., 395775., 397517., 430577., 434149., 436189., 447709., 458201., 469514., 472786., 484715., 487155., 510656., 518022., 531994., 558569., 575550., 591088., 601485., 615593., 619201., 642581., 675931., 681676., 684767., 685732., 688887., 714572., 752279., 753552., 761081., 821750., 833423., 833430., 844647., 859482., 875877., 909559., 978069., 985851., 1017303., 1035872., 1097470., 1188989., 1204547., 1207353., 1224179., 1235386., 1246484., 1315102., 1331590., 1358528., 1388043., 1457872., 1550764., 1562694., 1791403., 1890650., 1897575., 1956261., 2018158., 2022957., 2045869., 2121243., 2213991., 2225220., 2482173., 2663667., 2669238., 2922784., 2981235., 3156087., 3847672., 3860764.]), array([ 
0., 2990., 8115., 11755., 11767., 17400., 20842., 26300., 27302., 27387., 29646., 30705., 32100., 32119., 32868., 34842., 36670., 36946., 40604., 44623., 45683., 50097., 52108., 53900., 56070., 58533., 59263., 60560., 60906., 62000., 66100., 66529., 68142., 71807., 80361., 80609., 81879., 86256., 88007., 93323., 95648., 100000., 100900., 107658., 108624., 112137., 113000., 114418., 116000., 119658., 123317., 123546., 124582., 124647., 129791., 135765., 143800., 145000., 149549., 150000., 153640., 154364., 165071., 173552., 173556., 178705., 179252., 188633., 197539., 229895., 241282., 260313., 264185., 269596., 279052., 280277., 302600., 309582., 317700., 320627., 321686., 363054., 394795., 400000., 405966., 423049., 429591., 434318., 447900., 460668., 498913., 516342., 527300., 540479., 564400., 570723., 581800., 605400., 655919., 659400., 694400., 763953., 1016400., 1023453., 1046700., 1081000., 1131615., 1187958., 1197400., 1214100., 1239200., 1263500., 1272600., 1329200., 1737800.]), array([2000, 2002, 2003, 2006, 2007]), array([ 117.8 , 162.04, 185.09, 263.68, 290.76, 295.02, 340.65, 341.43, 377.16, 390.2 , 445.36, 725.9 , 797.35, 919.11, 1029.92, 1052.88, 1232.03, 1243.43, 1254.17, 1363.56, 1426.34, 1612.65, 1672.96, 1804. , 1845.72, 1853.65, 1886.35, 2011.19, 2080.04, 2120.35, 2175.68, 2253.39, 2277.35, 2312.82, 2324.8 , 2338.98, 2348.54, 2450.48, 2523.73, 2556.02, 2587.72, 2821.11, 2855.23, 2884.11, 2902.09, 3045.26, 3151.4 , 3161.66, 3519.72, 3523.16, 3545.39, 3551.49, 3620.27, 3637.2 , 3764.54, 3907.23, 3923.11, 3928.2 , 3988.14, 4056.76, 4057.4 , 4151.54, 4212.82, 4275.12, 4315. , 4462.74, 4467.55, 4659.99, 4676.13, 4725.01, 4743.61, 4746.16, 4757.45, 4772.52, 4820.53, 4983.67, 5007.21, 5043.96, 5052.99, 5333.09, 5757.29, 5823.41, 6035.48, 6112.5 , 6211.8 , 6867.7 , 7104. 
, 7360.92, 7583.85, 7617.47, 7688.67, 7697.82, 8003.67, 8117.78, 8690.24, 9248.53, 9333.4 , 9439.6 , 9456.84, 9705.02, 9846.81, 10275.5 , 10606.85, 10741.25, 12078.15, 12362.79, 13502.42, 13607.32, 15012.46, 15718.47, 15844.64, 18598.69, 18753.73, 21742.05, 21900.19, 25776.91, 26587.76, 31777.01]), array([ 2, 293, 467, 1522, 1534, 1741, 1743, 1899, 1911, 2200, 2418, 2501, 2522, 2954, 3718, 3821, 4521, 4726, 5047, 6121, 6235, 8384, 9384, 10366, 11020, 11169, 12484, 12651, 12812, 21164, 21361, 22472, 24468, 28842, 30086, 30120, 30234, 31000, 31847, 32080, 32180, 33190, 33766, 35511, 36005, 36720, 38375, 39453, 39575, 40463, 41231, 41726, 41856, 43694, 44740, 52466, 53903, 55583, 56403, 66100, 67833, 67923, 68396, 69595, 76064, 90022, 92489, 94368, 101835, 108197, 108534, 112001, 119516, 120819, 139354, 142665, 156886, 168368, 170801, 172464, 184526, 208508, 219126, 241621, 242000, 244853, 259335, 259903, 276622, 280657, 299892, 306162, 307610, 322047, 327051, 343191, 383837, 406058, 413077, 455191, 473404, 498055, 506572, 601617, 607756, 691482, 782294, 888935, 1000069, 1018960, 1036576, 1101159, 1128091, 1133400, 1318339, 1451065, 1712603, 1743140]), array([0. , 0.02702703, 0.03 , 0.03125 , 0.04761905, 0.09677419, 0.20512821, 0.22 , 0.4 , 1.21428571]), array([0. , 0.03 , 0.03571429, 0.10869565, 0.11111111, 0.13 , 0.13888889, 0.15384615, 0.16 , 0.24 , 0.27027027, 0.3 , 0.31 , 0.4 , 0.41025641, 0.4375 , 0.5 , 0.7948718 ]), array([0. 
, 0.03571429, 0.05128205, 0.12820513, 0.13 , 0.21621622, 0.22222222, 0.23076923, 0.27586207, 0.3 , 0.32432432, 0.4 , 0.40625 , 0.47 , 0.51612903, 0.53 , 0.55 , 0.71052632, 0.8125 ]), array(['1060812', '1082935', '1089674', '1108348', '11537149', '1163113', '11673659', '118013', '1212843', '123888', '1292604', '1310512', '1321004', '1389153', '1443753', '147235', '14740022', '14926380', '1514364', '1514799', '1543658', '1548155', '157652', '1600475', '1601508', '16494981', '1667114', '16753980', '1675757', '1677840', '16804703', '169770', '1710605', '1723026', '1755299', '1762409', '1802055', '1807967', '1841592', '1851377', '1896479', '1913563', '1925862', '1938812', '201412', '2018672', '2024337', '202761', '2110577', '2125369', '2195820', '22377276', '2308652', '2329505', '233299', '2373047', '2419708', '2450874', '2511249', '2523352', '2525301', '2567976', '2648861', '27858007', '2823366', '2823413', '2851375', '2858600', '2972212', '3206892', '3434548', '3444533', '3816261', '3898510', '4032810', '4188265', '4247403', '4404689', '4427000', '4468640', '447643', '4752398', '4830320', '4830392', '4867146', '4958329', '505196', '50819', '5145006', '5596906', '567083', '5903552', '597159', '59841', '6065508', '6166904', '6212824', '6217715', '6879383', '693750', '6994577', '70048', '7071605', '740947', '776120', '7891198', '800312', '830159', '8620804', '8818088', '919235', '924080', '932549', '960708', '966606', '971485', '974325', '9898522'], dtype=object), array(['East China', 'North China', 'Northeast China', 'Northwest China', 'South Central China', 'Southwest China'], dtype=object), array([ 475184, 546541, 632880, 736165, 757990, 819028, 866691, 948521, 976396, 1047698, 1078754, 1109537, 1174622, 1184990, 1210637, 1216605, 1228569, 1258100, 1308445, 1333133, 1364344, 1364980, 1423771, 1426600, 1428990, 1440939, 1472622, 1492835, 1499110, 1554999, 1648826, 1658350, 1742585, 1782317, 1845611, 1873822, 1898911, 1927102, 1962192, 1962633, 1986738, 2017594, 
2023674, 2047192, 2052220, 2053980, 2072426, 2135224, 2138158, 2138758, 2143190, 2150325, 2254281, 2261631, 2268499, 2339769, 2347862, 2355164, 2376983, 2378616, 2404936, 2444270, 2455900, 2545841, 2553268, 2649011, 2764053, 2815820, 2867525, 2907301, 2926542, 2939778, 2940367, 2977880, 3035767, 3051103, 3101537, 3114638, 3124234, 3343228, 3388449, 3545004, 3557071, 3586373, 3847158, 3893879, 3923569, 4039036, 4062020, 4073606, 4133488, 4229821, 4390259, 4559252, 4607955, 4613724, 4686125, 4947824, 5046865, 5167300, 5304833, 5502470, 5639838, 6003791, 6033279, 6185600, 6308151, 6349262, 6357869, 6832541, 7040099, 7537692, 7601825, 7646885, 7666512, 7968319, 8340692, 10533312])]
ord_encoder.transform(df)
array([[ 0., 0., 17., ..., 24., 0., 28.], [ 1., 0., 48., ..., 29., 0., 60.], [ 2., 0., 64., ..., 40., 0., 67.], ..., [115., 27., 44., ..., 97., 0., 53.], [116., 27., 84., ..., 4., 0., 64.], [117., 27., 78., ..., 25., 0., 71.]])
from sklearn.preprocessing import OneHotEncoder
# Demo: one-hot encode the whole frame with default settings. Each distinct
# value of each column becomes its own indicator column.
ohe = OneHotEncoder()
# With the defaults the result is a SciPy sparse matrix (the recorded
# output below shows CSR); call .toarray() to densify it.
X_trans = ohe.fit_transform(df)
X_trans
<118x909 sparse matrix of type '<class 'numpy.float64'>' with 1534 stored elements in Compressed Sparse Row format>
X_trans.toarray()
array([[1., 0., 0., ..., 0., 0., 0.], [0., 1., 0., ..., 0., 0., 0.], [0., 0., 1., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
# Ask the encoder for a dense ndarray directly instead of a sparse matrix.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and the old spelling was removed in 1.4, so `sparse=False` raises
# TypeError on current releases.
ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(df)
array([[1., 0., 0., ..., 0., 0., 0.], [0., 1., 0., ..., 0., 0., 0.], [0., 0., 1., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
# Dense output plus handle_unknown='ignore': a category that was not seen
# during fit is encoded as all zeros at transform time instead of raising.
# `sparse_output` replaces the `sparse` keyword (renamed in scikit-learn
# 1.2, old name removed in 1.4).
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(df)
OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe.transform(df)
array([[1., 0., 0., ..., 0., 0., 0.], [0., 1., 0., ..., 0., 0., 0.], [0., 0., 1., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 0., 0.]])
ohe.categories_
[array([ 4, 6, 7, 10, 11, 16, 18, 19, 22, 23, 34, 35, 40, 42, 43, 46, 47, 52, 54, 58, 64, 66, 67, 70, 71, 76, 78, 79, 82, 83, 88, 90, 91, 94, 95, 107, 112, 119, 124, 126, 127, 130, 131, 136, 138, 139, 142, 143, 148, 150, 151, 154, 155, 160, 162, 163, 166, 167, 172, 174, 175, 178, 179, 184, 186, 187, 190, 191, 196, 198, 199, 202, 203, 220, 222, 223, 226, 227, 232, 234, 235, 239, 244, 246, 247, 250, 251, 258, 259, 262, 263, 280, 282, 283, 292, 294, 295, 298, 310, 316, 318, 319, 322, 323, 328, 330, 331, 334, 335, 340, 342, 343, 346, 347, 354, 355, 358, 359]), array(['Anhui', 'Beijing', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong', 'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan', 'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Ningxia', 'Qinghai', 'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Tianjin', 'Tibet', 'Xinjiang', 'Yunnan', 'Zhejiang'], dtype=object), array([ 72939., 91405., 96825., 103041., 107046., 107687., 119536., 127819., 133858., 137190., 142650., 144839., 147749., 148812., 160637., 179235., 181409., 195580., 207615., 217707., 223984., 228043., 237948., 248903., 251539., 254002., 262197., 265770., 271297., 271499., 281769., 295133., 319710., 331999., 337894., 340372., 353776., 354775., 359275., 365437., 367568., 369552., 370049., 379186., 391292., 395775., 397517., 430577., 434149., 436189., 447709., 458201., 469514., 472786., 484715., 487155., 510656., 518022., 531994., 558569., 575550., 591088., 601485., 615593., 619201., 642581., 675931., 681676., 684767., 685732., 688887., 714572., 752279., 753552., 761081., 821750., 833423., 833430., 844647., 859482., 875877., 909559., 978069., 985851., 1017303., 1035872., 1097470., 1188989., 1204547., 1207353., 1224179., 1235386., 1246484., 1315102., 1331590., 1358528., 1388043., 1457872., 1550764., 1562694., 1791403., 1890650., 1897575., 1956261., 2018158., 2022957., 2045869., 2121243., 2213991., 2225220., 2482173., 2663667., 2669238., 2922784., 2981235., 3156087., 3847672., 3860764.]), array([ 
0., 2990., 8115., 11755., 11767., 17400., 20842., 26300., 27302., 27387., 29646., 30705., 32100., 32119., 32868., 34842., 36670., 36946., 40604., 44623., 45683., 50097., 52108., 53900., 56070., 58533., 59263., 60560., 60906., 62000., 66100., 66529., 68142., 71807., 80361., 80609., 81879., 86256., 88007., 93323., 95648., 100000., 100900., 107658., 108624., 112137., 113000., 114418., 116000., 119658., 123317., 123546., 124582., 124647., 129791., 135765., 143800., 145000., 149549., 150000., 153640., 154364., 165071., 173552., 173556., 178705., 179252., 188633., 197539., 229895., 241282., 260313., 264185., 269596., 279052., 280277., 302600., 309582., 317700., 320627., 321686., 363054., 394795., 400000., 405966., 423049., 429591., 434318., 447900., 460668., 498913., 516342., 527300., 540479., 564400., 570723., 581800., 605400., 655919., 659400., 694400., 763953., 1016400., 1023453., 1046700., 1081000., 1131615., 1187958., 1197400., 1214100., 1239200., 1263500., 1272600., 1329200., 1737800.]), array([2000, 2002, 2003, 2006, 2007]), array([ 117.8 , 162.04, 185.09, 263.68, 290.76, 295.02, 340.65, 341.43, 377.16, 390.2 , 445.36, 725.9 , 797.35, 919.11, 1029.92, 1052.88, 1232.03, 1243.43, 1254.17, 1363.56, 1426.34, 1612.65, 1672.96, 1804. , 1845.72, 1853.65, 1886.35, 2011.19, 2080.04, 2120.35, 2175.68, 2253.39, 2277.35, 2312.82, 2324.8 , 2338.98, 2348.54, 2450.48, 2523.73, 2556.02, 2587.72, 2821.11, 2855.23, 2884.11, 2902.09, 3045.26, 3151.4 , 3161.66, 3519.72, 3523.16, 3545.39, 3551.49, 3620.27, 3637.2 , 3764.54, 3907.23, 3923.11, 3928.2 , 3988.14, 4056.76, 4057.4 , 4151.54, 4212.82, 4275.12, 4315. , 4462.74, 4467.55, 4659.99, 4676.13, 4725.01, 4743.61, 4746.16, 4757.45, 4772.52, 4820.53, 4983.67, 5007.21, 5043.96, 5052.99, 5333.09, 5757.29, 5823.41, 6035.48, 6112.5 , 6211.8 , 6867.7 , 7104. 
, 7360.92, 7583.85, 7617.47, 7688.67, 7697.82, 8003.67, 8117.78, 8690.24, 9248.53, 9333.4 , 9439.6 , 9456.84, 9705.02, 9846.81, 10275.5 , 10606.85, 10741.25, 12078.15, 12362.79, 13502.42, 13607.32, 15012.46, 15718.47, 15844.64, 18598.69, 18753.73, 21742.05, 21900.19, 25776.91, 26587.76, 31777.01]), array([ 2, 293, 467, 1522, 1534, 1741, 1743, 1899, 1911, 2200, 2418, 2501, 2522, 2954, 3718, 3821, 4521, 4726, 5047, 6121, 6235, 8384, 9384, 10366, 11020, 11169, 12484, 12651, 12812, 21164, 21361, 22472, 24468, 28842, 30086, 30120, 30234, 31000, 31847, 32080, 32180, 33190, 33766, 35511, 36005, 36720, 38375, 39453, 39575, 40463, 41231, 41726, 41856, 43694, 44740, 52466, 53903, 55583, 56403, 66100, 67833, 67923, 68396, 69595, 76064, 90022, 92489, 94368, 101835, 108197, 108534, 112001, 119516, 120819, 139354, 142665, 156886, 168368, 170801, 172464, 184526, 208508, 219126, 241621, 242000, 244853, 259335, 259903, 276622, 280657, 299892, 306162, 307610, 322047, 327051, 343191, 383837, 406058, 413077, 455191, 473404, 498055, 506572, 601617, 607756, 691482, 782294, 888935, 1000069, 1018960, 1036576, 1101159, 1128091, 1133400, 1318339, 1451065, 1712603, 1743140]), array([0. , 0.02702703, 0.03 , 0.03125 , 0.04761905, 0.09677419, 0.20512821, 0.22 , 0.4 , 1.21428571]), array([0. , 0.03 , 0.03571429, 0.10869565, 0.11111111, 0.13 , 0.13888889, 0.15384615, 0.16 , 0.24 , 0.27027027, 0.3 , 0.31 , 0.4 , 0.41025641, 0.4375 , 0.5 , 0.7948718 ]), array([0. 
, 0.03571429, 0.05128205, 0.12820513, 0.13 , 0.21621622, 0.22222222, 0.23076923, 0.27586207, 0.3 , 0.32432432, 0.4 , 0.40625 , 0.47 , 0.51612903, 0.53 , 0.55 , 0.71052632, 0.8125 ]), array(['1060812', '1082935', '1089674', '1108348', '11537149', '1163113', '11673659', '118013', '1212843', '123888', '1292604', '1310512', '1321004', '1389153', '1443753', '147235', '14740022', '14926380', '1514364', '1514799', '1543658', '1548155', '157652', '1600475', '1601508', '16494981', '1667114', '16753980', '1675757', '1677840', '16804703', '169770', '1710605', '1723026', '1755299', '1762409', '1802055', '1807967', '1841592', '1851377', '1896479', '1913563', '1925862', '1938812', '201412', '2018672', '2024337', '202761', '2110577', '2125369', '2195820', '22377276', '2308652', '2329505', '233299', '2373047', '2419708', '2450874', '2511249', '2523352', '2525301', '2567976', '2648861', '27858007', '2823366', '2823413', '2851375', '2858600', '2972212', '3206892', '3434548', '3444533', '3816261', '3898510', '4032810', '4188265', '4247403', '4404689', '4427000', '4468640', '447643', '4752398', '4830320', '4830392', '4867146', '4958329', '505196', '50819', '5145006', '5596906', '567083', '5903552', '597159', '59841', '6065508', '6166904', '6212824', '6217715', '6879383', '693750', '6994577', '70048', '7071605', '740947', '776120', '7891198', '800312', '830159', '8620804', '8818088', '919235', '924080', '932549', '960708', '966606', '971485', '974325', '9898522'], dtype=object), array(['East China', 'North China', 'Northeast China', 'Northwest China', 'South Central China', 'Southwest China'], dtype=object), array([ 475184, 546541, 632880, 736165, 757990, 819028, 866691, 948521, 976396, 1047698, 1078754, 1109537, 1174622, 1184990, 1210637, 1216605, 1228569, 1258100, 1308445, 1333133, 1364344, 1364980, 1423771, 1426600, 1428990, 1440939, 1472622, 1492835, 1499110, 1554999, 1648826, 1658350, 1742585, 1782317, 1845611, 1873822, 1898911, 1927102, 1962192, 1962633, 1986738, 2017594, 
2023674, 2047192, 2052220, 2053980, 2072426, 2135224, 2138158, 2138758, 2143190, 2150325, 2254281, 2261631, 2268499, 2339769, 2347862, 2355164, 2376983, 2378616, 2404936, 2444270, 2455900, 2545841, 2553268, 2649011, 2764053, 2815820, 2867525, 2907301, 2926542, 2939778, 2940367, 2977880, 3035767, 3051103, 3101537, 3114638, 3124234, 3343228, 3388449, 3545004, 3557071, 3586373, 3847158, 3893879, 3923569, 4039036, 4062020, 4073606, 4133488, 4229821, 4390259, 4559252, 4607955, 4613724, 4686125, 4947824, 5046865, 5167300, 5304833, 5502470, 5639838, 6003791, 6033279, 6185600, 6308151, 6349262, 6357869, 6832541, 7040099, 7537692, 7601825, 7646885, 7666512, 7968319, 8340692, 10533312])]
# Small hand-written frame with two categorical columns, given row by row
# as (province, region) records.
df_train = pd.DataFrame(
    [
        ("Zhejiang", "East China"),
        ("Beijing", "North China"),
        ("Shanghai", "Southwest China"),
    ],
    columns=["province", "region"],
)
# Re-fit the existing encoder so it only knows the categories listed above.
ohe.fit(df_train)
OneHotEncoder(handle_unknown='ignore', sparse=False)
ohe.categories_
[array(['Beijing', 'Shanghai', 'Zhejiang'], dtype=object), array(['East China', 'North China', 'Southwest China'], dtype=object)]
ohe.transform(df_train)
array([[0., 0., 1., 1., 0., 0.], [1., 0., 0., 0., 1., 0.], [0., 1., 0., 0., 0., 1.]])
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
df
Unnamed: 0 | province | specific | general | year | gdp | fdi | rnr | rr | i | fr | reg | it | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4 | 4 | Anhui | 195580.0 | 32100.0 | 2000 | 2902.09 | 31847 | 0.000000 | 0.000000 | 0.000000 | 1601508 | East China | 1499110 |
6 | 6 | Anhui | 434149.0 | 66529.0 | 2002 | 3519.72 | 38375 | 0.000000 | 0.000000 | 0.000000 | 1677840 | East China | 2404936 |
7 | 7 | Anhui | 619201.0 | 52108.0 | 2003 | 3923.11 | 36720 | 0.000000 | 0.000000 | 0.000000 | 1896479 | East China | 2815820 |
10 | 10 | Anhui | 1457872.0 | 279052.0 | 2006 | 6112.50 | 139354 | 0.000000 | 0.000000 | 0.324324 | 3434548 | East China | 5167300 |
11 | 11 | Anhui | 2213991.0 | 178705.0 | 2007 | 7360.92 | 299892 | 0.000000 | 0.000000 | 0.324324 | 4468640 | East China | 7040099 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
347 | 347 | Yunnan | 2482173.0 | 564400.0 | 2007 | 4772.52 | 39453 | 0.000000 | 0.000000 | 0.000000 | 4867146 | Southwest China | 6832541 |
354 | 354 | Zhejiang | 365437.0 | 321686.0 | 2002 | 8003.67 | 307610 | 0.000000 | 0.000000 | 0.000000 | 4958329 | East China | 1962633 |
355 | 355 | Zhejiang | 391292.0 | 260313.0 | 2003 | 9705.02 | 498055 | 1.214286 | 0.035714 | 0.035714 | 6217715 | East China | 2261631 |
358 | 358 | Zhejiang | 1017303.0 | 394795.0 | 2006 | 15718.47 | 888935 | 1.214286 | 0.035714 | 0.035714 | 11537149 | East China | 2553268 |
359 | 359 | Zhejiang | 844647.0 | 0.0 | 2007 | 18753.73 | 1036576 | 0.047619 | 0.000000 | 0.000000 | 16494981 | East China | 2939778 |
118 rows × 13 columns
# Per-column preprocessing: standardize the two numeric columns and
# ordinal-encode the region column. Columns that are not listed do not
# appear in the result (the recorded output has three columns).
ct = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), ['fdi', 'gdp']),
        ('categorical', OrdinalEncoder(), ['reg']),
    ]
)
ct.fit_transform(df)
array([[-0.54088554, -0.48498461, 0. ], [-0.52313911, -0.37867596, 0. ], [-0.52763824, -0.30924305, 0. ], [-0.24862668, 0.06760246, 0. ], [ 0.18779748, 0.28248491, 0. ], [-0.16975183, -0.44030651, 1. ], [-0.15861681, -0.24178957, 1. ], [-0.0317657 , -0.12264395, 1. ], [ 0.60997938, 0.41275831, 1. ], [ 0.74965914, 0.71036505, 1. ], [-0.43826722, -0.31197638, 5. ], [-0.33241116, -0.1796306 , 5. ], [ 0.30550625, -0.33653668, 0. ], [ 0.41600281, -0.21553212, 0. ], [ 0.07908699, -0.12669573, 0. ], [ 0.24802608, 0.3208564 , 0. ], [ 0.47641082, 0.60738699, 0. ], [-0.61051202, -0.80327715, 3. ], [-0.61082193, -0.77244122, 3. ], [-0.61943145, -0.59251706, 3. ], [ 2.43926479, 0.86431921, 4. ], [ 2.45369736, 1.33958151, 4. ], [ 1.49921217, 1.74273266, 4. ], [ 3.31727285, 3.5918763 , 4. ], [ 4.02826654, 4.48506828, 4. ], [-0.48483258, -0.62647875, 4. ], [-0.51402938, -0.55010927, 4. ], [-0.51367597, -0.49892317, 4. ], [-0.50583579, -0.16757679, 4. ], [-0.44152672, 0.01784327, 4. ], [-0.62066294, -0.8072291 , 5. ], [-0.6170745 , -0.77047901, 5. ], [-0.61517155, -0.7389959 , 5. ], [-0.60195143, -0.58190909, 5. ], [-0.59307006, -0.48807939, 5. ], [-0.32298609, -0.7686304 , 4. ], [-0.44281257, -0.11631841, 1. ], [ 0.02938719, 1.35763727, 1. ], [-0.54567284, -0.4420725 , 2. ], [-0.53092492, -0.35845489, 2. ], [-0.53998028, -0.28612859, 2. ], [-0.1631377 , 0.08469433, 2. ], [-0.06063084, 0.23826293, 2. ], [-0.47412981, -0.11476413, 4. ], [-0.51746286, 0.05434551, 4. ], [-0.48092608, 0.19759014, 4. ], [-0.12582615, 1.14342438, 4. ], [ 0.20484254, 1.59949491, 4. ], [-0.37092157, -0.37425755, 4. ], [-0.23962569, -0.25937715, 4. ], [-0.20096576, -0.16563352, 4. ], [ 0.03817341, 0.3266432 , 4. ], [ 0.12453776, 0.62199511, 4. ], [-0.44305724, -0.3732076 , 4. ], [-0.38273622, -0.26992488, 4. ], [-0.35062246, -0.18240867, 4. ], [ 0.07754287, 0.33889839, 4. ], [ 0.2616295 , 0.64027463, 4. ], [ 1.02472886, 0.34047332, 0. ], [ 1.25233884, 0.64324204, 0. ], [ 2.14259107, 0.84118581, 0. 
], [ 2.95645589, 2.21676944, 0. ], [ 4.11128168, 2.75781563, 0. ], [-0.54025213, -0.6654458 , 0. ], [-0.5198769 , -0.61001686, 0. ], [-0.3333273 , -0.56271731, 0. ], [ 0.0304175 , -0.28623875, 0. ], [ 0.13550694, -0.15477596, 0. ], [-0.54558041, -0.69654679, 2. ], [-0.53566872, -0.61954045, 2. ], [-0.56094543, -0.58026359, 2. ], [-0.44776842, -0.36136894, 2. ], [-0.42068118, -0.24865385, 2. ], [-0.62272901, -0.93372268, 3. ], [-0.62148121, -0.91958445, 3. ], [-0.62272357, -0.90784563, 3. ], [-0.61735451, -0.8595581 , 3. ], [-0.61374161, -0.82630211, 3. ], [-0.59750395, -0.93911703, 3. ], [-0.61461425, -0.92586868, 3. ], [-0.62060585, -0.91733996, 3. ], [-0.54318812, -0.84725987, 3. ], [-0.54905466, -0.67399173, 3. ], [-0.52958198, -0.59664114, 3. ], [-0.53723458, -0.53909508, 3. ], [-0.37602966, -0.16801571, 3. ], [-0.30255648, 0.00646247, 3. ], [ 0.6594916 , 0.78415268, 0. ], [ 1.00803993, 1.09443114, 0. ], [ 2.0912357 , 2.78503525, 0. ], [ 2.36604988, 3.45230994, 0. ], [-0.56637157, -0.66681073, 1. ], [-0.56992738, -0.5843498 , 1. ], [-0.56939184, -0.49305032, 1. ], [-0.50867935, -0.30836695, 5. ], [-0.47635899, -0.1712172 , 5. ], [-0.51537504, -0.06655233, 5. ], [-0.29901427, 0.51129215, 5. ], [ 0.49549204, -0.21636004, 1. ], [-0.62745649, -0.96422641, 5. ], [-0.62666541, -0.95661166, 5. ], [-0.62619239, -0.95264422, 5. ], [-0.62332436, -0.93445592, 5. ], [-0.62088857, -0.92573443, 5. ], [-0.62226686, -0.74980181, 3. ], [-0.62229948, -0.70692756, 3. ], [-0.62329174, -0.65981736, 3. ], [-0.59928186, -0.46034169, 3. ], [-0.59352405, -0.37808386, 3. ], [-0.59263238, -0.63832946, 5. ], [-0.59709889, -0.58641184, 5. ], [-0.60466994, -0.5445514 , 5. ], [-0.5452705 , -0.29804986, 5. ], [-0.52020855, -0.16303961, 5. ], [ 0.20877895, 0.3931173 , 0. ], [ 0.72650559, 0.68595965, 0. ], [ 1.7891168 , 1.72101584, 0. ], [ 2.19048034, 2.24345547, 0. ]])
# Same layout as before, but the region column is one-hot encoded, so it
# contributes one indicator column per distinct region.
ct = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), ['fdi', 'gdp']),
        ('categorical', OneHotEncoder(), ['reg']),
    ]
)
ct.fit_transform(df)
array([[-0.54088554, -0.48498461, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.52313911, -0.37867596, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.52763824, -0.30924305, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.24862668, 0.06760246, 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.18779748, 0.28248491, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.16975183, -0.44030651, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.15861681, -0.24178957, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.0317657 , -0.12264395, 0. , 1. , 0. , 0. , 0. , 0. ], [ 0.60997938, 0.41275831, 0. , 1. , 0. , 0. , 0. , 0. ], [ 0.74965914, 0.71036505, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.43826722, -0.31197638, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.33241116, -0.1796306 , 0. , 0. , 0. , 0. , 0. , 1. ], [ 0.30550625, -0.33653668, 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.41600281, -0.21553212, 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.07908699, -0.12669573, 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.24802608, 0.3208564 , 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.47641082, 0.60738699, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.61051202, -0.80327715, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.61082193, -0.77244122, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.61943145, -0.59251706, 0. , 0. , 0. , 1. , 0. , 0. ], [ 2.43926479, 0.86431921, 0. , 0. , 0. , 0. , 1. , 0. ], [ 2.45369736, 1.33958151, 0. , 0. , 0. , 0. , 1. , 0. ], [ 1.49921217, 1.74273266, 0. , 0. , 0. , 0. , 1. , 0. ], [ 3.31727285, 3.5918763 , 0. , 0. , 0. , 0. , 1. , 0. ], [ 4.02826654, 4.48506828, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.48483258, -0.62647875, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.51402938, -0.55010927, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.51367597, -0.49892317, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.50583579, -0.16757679, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.44152672, 0.01784327, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.62066294, -0.8072291 , 0. , 0. , 0. , 0. , 0. , 1. ], [-0.6170745 , -0.77047901, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.61517155, -0.7389959 , 0. , 0. , 0. , 0. , 0. , 1. ], [-0.60195143, -0.58190909, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.59307006, -0.48807939, 0. 
, 0. , 0. , 0. , 0. , 1. ], [-0.32298609, -0.7686304 , 0. , 0. , 0. , 0. , 1. , 0. ], [-0.44281257, -0.11631841, 0. , 1. , 0. , 0. , 0. , 0. ], [ 0.02938719, 1.35763727, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.54567284, -0.4420725 , 0. , 0. , 1. , 0. , 0. , 0. ], [-0.53092492, -0.35845489, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.53998028, -0.28612859, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.1631377 , 0.08469433, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.06063084, 0.23826293, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.47412981, -0.11476413, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.51746286, 0.05434551, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.48092608, 0.19759014, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.12582615, 1.14342438, 0. , 0. , 0. , 0. , 1. , 0. ], [ 0.20484254, 1.59949491, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.37092157, -0.37425755, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.23962569, -0.25937715, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.20096576, -0.16563352, 0. , 0. , 0. , 0. , 1. , 0. ], [ 0.03817341, 0.3266432 , 0. , 0. , 0. , 0. , 1. , 0. ], [ 0.12453776, 0.62199511, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.44305724, -0.3732076 , 0. , 0. , 0. , 0. , 1. , 0. ], [-0.38273622, -0.26992488, 0. , 0. , 0. , 0. , 1. , 0. ], [-0.35062246, -0.18240867, 0. , 0. , 0. , 0. , 1. , 0. ], [ 0.07754287, 0.33889839, 0. , 0. , 0. , 0. , 1. , 0. ], [ 0.2616295 , 0.64027463, 0. , 0. , 0. , 0. , 1. , 0. ], [ 1.02472886, 0.34047332, 1. , 0. , 0. , 0. , 0. , 0. ], [ 1.25233884, 0.64324204, 1. , 0. , 0. , 0. , 0. , 0. ], [ 2.14259107, 0.84118581, 1. , 0. , 0. , 0. , 0. , 0. ], [ 2.95645589, 2.21676944, 1. , 0. , 0. , 0. , 0. , 0. ], [ 4.11128168, 2.75781563, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.54025213, -0.6654458 , 1. , 0. , 0. , 0. , 0. , 0. ], [-0.5198769 , -0.61001686, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.3333273 , -0.56271731, 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.0304175 , -0.28623875, 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.13550694, -0.15477596, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.54558041, -0.69654679, 0. , 0. , 1. , 0. , 0. , 0. 
], [-0.53566872, -0.61954045, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.56094543, -0.58026359, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.44776842, -0.36136894, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.42068118, -0.24865385, 0. , 0. , 1. , 0. , 0. , 0. ], [-0.62272901, -0.93372268, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.62148121, -0.91958445, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.62272357, -0.90784563, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.61735451, -0.8595581 , 0. , 0. , 0. , 1. , 0. , 0. ], [-0.61374161, -0.82630211, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.59750395, -0.93911703, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.61461425, -0.92586868, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.62060585, -0.91733996, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.54318812, -0.84725987, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.54905466, -0.67399173, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.52958198, -0.59664114, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.53723458, -0.53909508, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.37602966, -0.16801571, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.30255648, 0.00646247, 0. , 0. , 0. , 1. , 0. , 0. ], [ 0.6594916 , 0.78415268, 1. , 0. , 0. , 0. , 0. , 0. ], [ 1.00803993, 1.09443114, 1. , 0. , 0. , 0. , 0. , 0. ], [ 2.0912357 , 2.78503525, 1. , 0. , 0. , 0. , 0. , 0. ], [ 2.36604988, 3.45230994, 1. , 0. , 0. , 0. , 0. , 0. ], [-0.56637157, -0.66681073, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.56992738, -0.5843498 , 0. , 1. , 0. , 0. , 0. , 0. ], [-0.56939184, -0.49305032, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.50867935, -0.30836695, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.47635899, -0.1712172 , 0. , 0. , 0. , 0. , 0. , 1. ], [-0.51537504, -0.06655233, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.29901427, 0.51129215, 0. , 0. , 0. , 0. , 0. , 1. ], [ 0.49549204, -0.21636004, 0. , 1. , 0. , 0. , 0. , 0. ], [-0.62745649, -0.96422641, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.62666541, -0.95661166, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.62619239, -0.95264422, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.62332436, -0.93445592, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.62088857, -0.92573443, 0. 
, 0. , 0. , 0. , 0. , 1. ], [-0.62226686, -0.74980181, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.62229948, -0.70692756, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.62329174, -0.65981736, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.59928186, -0.46034169, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.59352405, -0.37808386, 0. , 0. , 0. , 1. , 0. , 0. ], [-0.59263238, -0.63832946, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.59709889, -0.58641184, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.60466994, -0.5445514 , 0. , 0. , 0. , 0. , 0. , 1. ], [-0.5452705 , -0.29804986, 0. , 0. , 0. , 0. , 0. , 1. ], [-0.52020855, -0.16303961, 0. , 0. , 0. , 0. , 0. , 1. ], [ 0.20877895, 0.3931173 , 1. , 0. , 0. , 0. , 0. , 0. ], [ 0.72650559, 0.68595965, 1. , 0. , 0. , 0. , 0. , 0. ], [ 1.7891168 , 1.72101584, 1. , 0. , 0. , 0. , 0. , 0. ], [ 2.19048034, 2.24345547, 1. , 0. , 0. , 0. , 0. , 0. ]])
# df
y
4 195580.0 6 434149.0 7 619201.0 10 1457872.0 11 2213991.0 ... 347 2482173.0 354 365437.0 355 391292.0 358 1017303.0 359 844647.0 Name: specific, Length: 118, dtype: float64
X.head()
province | general | year | gdp | fdi | rnr | rr | i | fr | reg | it | |
---|---|---|---|---|---|---|---|---|---|---|---|
4 | Anhui | 32100.0 | 2000 | 2902.09 | 31847 | 0.0 | 0.0 | 0.000000 | 1601508 | East China | 1499110 |
6 | Anhui | 66529.0 | 2002 | 3519.72 | 38375 | 0.0 | 0.0 | 0.000000 | 1677840 | East China | 2404936 |
7 | Anhui | 52108.0 | 2003 | 3923.11 | 36720 | 0.0 | 0.0 | 0.000000 | 1896479 | East China | 2815820 |
10 | Anhui | 279052.0 | 2006 | 6112.50 | 139354 | 0.0 | 0.0 | 0.324324 | 3434548 | East China | 5167300 |
11 | Anhui | 178705.0 | 2007 | 7360.92 | 299892 | 0.0 | 0.0 | 0.324324 | 4468640 | East China | 7040099 |
X.dtypes
province object general float64 year int64 gdp float64 fdi int64 rnr float64 rr float64 i float64 fr object reg object it int64 dtype: object
# Per-column summary table: number of missing entries next to the dtype.
# The dict keys become the column labels of the concatenated frame.
missing_values = pd.concat(
    {
        "na_cnt": X.isna().sum(),
        "dtypes": X.dtypes,
    },
    axis=1,
)
missing_values
na_cnt | dtypes | |
---|---|---|
province | 0 | object |
general | 0 | float64 |
year | 0 | int64 |
gdp | 0 | float64 |
fdi | 0 | int64 |
rnr | 0 | float64 |
rr | 0 | float64 |
i | 0 | float64 |
fr | 0 | object |
reg | 0 | object |
it | 0 | int64 |
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation; the fixed seed makes the split
# reproducible. No test_size is given, so scikit-learn's default applies.
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts
missing_values
na_cnt | dtypes | |
---|---|---|
province | 0 | object |
general | 0 | float64 |
year | 0 | int64 |
gdp | 0 | float64 |
fdi | 0 | int64 |
rnr | 0 | float64 |
rr | 0 | float64 |
i | 0 | float64 |
fr | 0 | object |
reg | 0 | object |
it | 0 | int64 |
# Display the full feature frame (truncated view of all rows and columns).
X
province | general | year | gdp | fdi | rnr | rr | i | fr | reg | it | |
---|---|---|---|---|---|---|---|---|---|---|---|
4 | Anhui | 32100.0 | 2000 | 2902.09 | 31847 | 0.000000 | 0.000000 | 0.000000 | 1601508 | East China | 1499110 |
6 | Anhui | 66529.0 | 2002 | 3519.72 | 38375 | 0.000000 | 0.000000 | 0.000000 | 1677840 | East China | 2404936 |
7 | Anhui | 52108.0 | 2003 | 3923.11 | 36720 | 0.000000 | 0.000000 | 0.000000 | 1896479 | East China | 2815820 |
10 | Anhui | 279052.0 | 2006 | 6112.50 | 139354 | 0.000000 | 0.000000 | 0.324324 | 3434548 | East China | 5167300 |
11 | Anhui | 178705.0 | 2007 | 7360.92 | 299892 | 0.000000 | 0.000000 | 0.324324 | 4468640 | East China | 7040099 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
347 | Yunnan | 564400.0 | 2007 | 4772.52 | 39453 | 0.000000 | 0.000000 | 0.000000 | 4867146 | Southwest China | 6832541 |
354 | Zhejiang | 321686.0 | 2002 | 8003.67 | 307610 | 0.000000 | 0.000000 | 0.000000 | 4958329 | East China | 1962633 |
355 | Zhejiang | 260313.0 | 2003 | 9705.02 | 498055 | 1.214286 | 0.035714 | 0.035714 | 6217715 | East China | 2261631 |
358 | Zhejiang | 394795.0 | 2006 | 15718.47 | 888935 | 1.214286 | 0.035714 | 0.035714 | 11537149 | East China | 2553268 |
359 | Zhejiang | 0.0 | 2007 | 18753.73 | 1036576 | 0.047619 | 0.000000 | 0.000000 | 16494981 | East China | 2939778 |
118 rows × 11 columns
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# StandardScaler is used below, so import it in this cell alongside the other
# pieces rather than relying on an import made elsewhere in the notebook.
from sklearn.preprocessing import StandardScaler

# Columns routed through the numerical preprocessing pipeline.
numerical_features = ['general', 'gdp', 'fdi', 'i', 'rr']

# Numerical preprocessing: fill missing values with SimpleImputer's default
# strategy, then standardize each column with StandardScaler.
num_prep = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
])
num_prep
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])
SimpleImputer()
StandardScaler()
# Fit the numerical pipeline on the training rows only and keep the result.
train_numeric = X_train[numerical_features]
num_trans = num_prep.fit_transform(train_numeric)
num_trans
array([[ 0.84445696, -0.23824571, 0.05133332, -0.46886997, 1.84019633], [ 0.24766853, 2.07395803, 1.57444985, -0.46886997, -0.43195321], [-0.72214407, -0.0446055 , -0.43939868, -0.46886997, -0.43195321], [-0.48856958, -0.11992158, -0.34379901, -0.46886997, -0.43195321], [ 0.85337791, -0.87758205, -0.54348651, -0.46886997, -0.43195321], [-0.62992082, -0.83196322, -0.62382665, -0.46886997, 0.17004293], [-0.83650183, 3.23074054, 4.28312405, 1.06326271, -0.43195321], [ 1.76828095, -0.19541419, -0.41644878, -0.46886997, -0.43195321], [ 1.78635605, 0.89747804, 0.79717537, -0.46886997, 3.97033652], [-0.01481382, -0.57518987, -0.60442312, 3.47737968, -0.43195321], [-0.33025804, -0.71766019, -0.62552372, -0.46886997, -0.43195321], [-0.75781248, -0.70583032, -0.54596728, -0.46886997, -0.43195321], [ 0.18859962, -0.10301925, -0.50475274, -0.46886997, 1.78339259], [ 3.61705142, 1.91072514, 0.2322093 , -0.46886997, -0.43195321], [-0.52047074, -0.52639929, -0.53731279, -0.46886997, -0.43195321], [-0.35308192, -0.41382042, -0.15623907, 2.47473995, -0.43195321], [ 0.51483816, -0.85369871, -0.61664935, -0.46886997, -0.43195321], [ 2.42485843, -0.08843151, 0.16030939, -0.46886997, 1.84019633], [-0.63055638, -0.3205428 , -0.53076977, -0.46886997, -0.43195321], [-0.81570508, -0.2076344 , -0.22869714, 0.25314756, -0.43195321], [-0.06101258, -0.78797637, -0.31514053, -0.46886997, -0.43195321], [-0.022315 , 0.78012485, 0.51382125, -0.46886997, -0.43195321], [-0.37712312, -0.55331904, -0.3258642 , 2.14150109, -0.43195321], [ 2.33926246, -0.46826211, -0.59521328, 3.47737968, -0.43195321], [ 0.62612061, -0.10351944, -0.37014586, -0.46886997, -0.43195321], [-0.1455937 , -0.05181405, -0.01314966, -0.46886997, 3.97033652], [-0.69280823, -0.04283426, -0.47187415, -0.46886997, -0.43195321], [-0.15946076, -0.25170559, -0.5456459 , -0.46886997, -0.43195321], [-0.80637666, -0.98226192, -0.59981116, -0.46886997, -0.43195321], [-0.45208878, -0.48061962, -0.51288289, -0.46886997, 1.78339259], [ 
0.4867555 , -0.58727866, -0.62254962, 0.24317887, -0.43195321], [-0.74721037, -0.26346288, -0.50770147, -0.46886997, -0.26580227], [ 0.71498921, -0.96701118, -0.62406064, 0.76534802, -0.43195321], [-0.6776111 , -0.6259812 , -0.48297275, 1.75272242, -0.43195321], [ 0.20388901, 0.55832718, 0.6523297 , -0.46886997, 3.97033652], [-0.68462536, -0.15766887, 0.45117911, -0.46886997, 1.22955614], [-0.68130147, -0.59197843, -0.52937716, -0.46886997, -0.43195321], [-0.1213603 , 0.16498962, -0.23803102, 1.33242116, -0.43195321], [-0.68649617, -0.8274596 , -0.61330032, -0.46886997, 0.42010287], [-0.73244378, -0.41583293, -0.54606312, -0.46886997, -0.43195321], [-0.75418874, -0.67194329, -0.56752738, -0.46886997, -0.43195321], [-0.11822093, -0.18759169, -0.14469223, 2.47473995, -0.43195321], [-0.5606009 , -0.89159703, -0.62039587, -0.46886997, -0.43195321], [-0.70811556, -0.99767742, -0.6295606 , 1.33242116, -0.43195321], [-0.44276036, -0.47392695, -0.57065935, -0.46886997, -0.43195321], [-0.61544895, -0.21965453, -0.37710046, -0.46886997, -0.43195321], [-0.57791999, -0.10080471, -0.18860741, -0.46886997, 0.18342062], [ 2.23213936, -0.34291185, -0.59568406, -0.46886997, -0.43195321], [ 2.06354861, 1.39098988, -0.11068883, -0.46886997, -0.43195321], [-0.79190991, -0.9574449 , -0.62376745, 0.24317887, -0.43195321], [ 0.26443406, 2.61416753, 3.08558822, 1.06326271, -0.43195321], [-0.01209987, 0.53594439, 0.23629128, -0.46886997, -0.43195321], [-0.80634591, -0.96716418, -0.61755426, -0.46886997, -0.43195321], [-0.16938374, 0.8696658 , 0.77316551, -0.27051351, -0.23415447], [-0.04311944, 0.18446741, -0.14938031, 0.73199078, -0.43195321], [-0.68041476, 0.47595163, 1.08241827, -0.18405044, -0.43195321], [-0.76631569, -0.33855142, -0.36484886, -0.46886997, 1.06490206], [-0.71942755, -1.0021987 , -0.63005112, -0.46886997, 1.28493979], [ 2.56991029, 0.81760341, 0.29109646, -0.46886997, 1.99108126], [-0.74181833, -0.94662522, -0.62596349, -0.46886997, -0.43195321], [-0.39172056, 
-0.75420507, -0.61813217, 0.73199078, -0.43195321], [-0.74252565, -0.58032118, -0.59939112, -0.46886997, -0.43195321], [-0.52984786, 0.82098506, 1.31844616, -0.46886997, -0.43195321], [-0.75226924, -0.67038781, -0.54044193, -0.46886997, -0.43195321], [-0.51722886, -0.15861235, 0.53360817, -0.46886997, -0.43195321], [ 0.94307429, 1.63510561, 0.0502649 , -0.46886997, 2.33722904], [ 2.27493734, 0.10828435, -0.43806527, 0.81281795, -0.43195321], [-0.62666612, -0.76651942, -0.62548989, -0.46886997, -0.43195321], [-0.66600439, -0.34358661, -0.52269601, -0.46886997, -0.43195321], [ 2.40153737, 0.79677218, 0.14893452, -0.46886997, 0.18342062], [ 0.09391613, 1.61452938, 2.56423569, -0.46886997, -0.43195321], [-0.70296186, -0.26446128, -0.52736154, -0.46886997, -0.43195321], [ 0.44208927, 4.18123112, 3.45974963, -0.46886997, -0.43195321], [-0.78308892, -1.01087642, -0.63087146, -0.46886997, 0.89725427], [-0.54691067, 0.98156598, 0.70367308, -0.46886997, -0.43195321], [ 0.60991632, -0.09784872, -0.51965707, -0.46886997, -0.43195321], [-0.76052643, -0.96000271, -0.62467519, -0.46886997, 0.33726408], [ 1.93383484, 0.09531485, -0.29395536, 4.04373958, -0.43195321], [-0.76653353, -0.6801267 , -0.54957002, -0.46886997, -0.43195321], [-0.83650183, 2.64457877, 2.29128386, -0.46886997, -0.43195321], [-0.51706228, 1.07292328, 2.54926935, -0.46886997, -0.43195321], [-0.39173081, 1.0465606 , 2.24162345, -0.46886997, -0.43195321], [ 0.54861269, -0.32386364, -0.44453781, -0.46886997, -0.43195321], [-0.41346552, -0.66397367, -0.62655268, -0.46886997, -0.43195321], [-0.66187067, -0.05643144, 0.1018028 , 1.19732432, -0.43195321], [-0.61096157, -0.57797129, -0.5712147 , -0.46886997, -0.43195321], [ 0.34407676, 0.4601909 , 0.05937608, -0.46886997, 0.18342062], [-0.46490255, -0.97695015, -0.62658651, 1.7874348 , -0.43195321]])
# One row per training sample, one column per entry in numerical_features.
num_trans.shape
(88, 5)
# Display the feature frame again before choosing the categorical columns.
X
province | general | year | gdp | fdi | rnr | rr | i | fr | reg | it | |
---|---|---|---|---|---|---|---|---|---|---|---|
4 | Anhui | 32100.0 | 2000 | 2902.09 | 31847 | 0.000000 | 0.000000 | 0.000000 | 1601508 | East China | 1499110 |
6 | Anhui | 66529.0 | 2002 | 3519.72 | 38375 | 0.000000 | 0.000000 | 0.000000 | 1677840 | East China | 2404936 |
7 | Anhui | 52108.0 | 2003 | 3923.11 | 36720 | 0.000000 | 0.000000 | 0.000000 | 1896479 | East China | 2815820 |
10 | Anhui | 279052.0 | 2006 | 6112.50 | 139354 | 0.000000 | 0.000000 | 0.324324 | 3434548 | East China | 5167300 |
11 | Anhui | 178705.0 | 2007 | 7360.92 | 299892 | 0.000000 | 0.000000 | 0.324324 | 4468640 | East China | 7040099 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
347 | Yunnan | 564400.0 | 2007 | 4772.52 | 39453 | 0.000000 | 0.000000 | 0.000000 | 4867146 | Southwest China | 6832541 |
354 | Zhejiang | 321686.0 | 2002 | 8003.67 | 307610 | 0.000000 | 0.000000 | 0.000000 | 4958329 | East China | 1962633 |
355 | Zhejiang | 260313.0 | 2003 | 9705.02 | 498055 | 1.214286 | 0.035714 | 0.035714 | 6217715 | East China | 2261631 |
358 | Zhejiang | 394795.0 | 2006 | 15718.47 | 888935 | 1.214286 | 0.035714 | 0.035714 | 11537149 | East China | 2553268 |
359 | Zhejiang | 0.0 | 2007 | 18753.73 | 1036576 | 0.047619 | 0.000000 | 0.000000 | 16494981 | East China | 2939778 |
118 rows × 11 columns
# OneHotEncoder is used below, so import it in this cell rather than relying
# on an import made elsewhere in the notebook.
from sklearn.preprocessing import OneHotEncoder

# Columns routed through the categorical preprocessing pipeline.
categorical_features = ['province', 'reg']

# Categorical preprocessing: replace missing entries with the placeholder
# category 'sk_missing', then one-hot encode.
cat_prep = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='sk_missing')),
    # handle_unknown='ignore' lets transform() accept categories that were not
    # seen during fit; sparse=False requests a dense array.
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output`; switch the keyword when upgrading.
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])
cat_prep
Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))])
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Fit the categorical pipeline on the training rows only and keep the result.
train_categorical = X_train[categorical_features]
cat_trans = cat_prep.fit_transform(train_categorical)
cat_trans
array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 1., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 1., 0.], [0., 0., 0., ..., 0., 0., 1.]])
# One row per training sample; the columns are the one-hot indicators produced
# for 'province' and 'reg'.
cat_trans.shape
(88, 33)
# ColumnTransformer is used below, so import it in this cell rather than
# relying on an import made elsewhere in the notebook.
from sklearn.compose import ColumnTransformer

# Apply each preprocessing pipeline to its own set of columns and concatenate
# the results side by side. Columns named in neither list are not passed
# through (ColumnTransformer's default is to drop the remainder).
ct = ColumnTransformer([
    ('numerical', num_prep, numerical_features),
    ('categorical', cat_prep, categorical_features),
])
ct
ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Fit the combined transformer on the training rows and transform them.
X_trans = ct.fit_transform(X_train)
# The leading five columns are the scaled numerical features (the 'numerical'
# transformer is listed first in ct).
X_trans[:, :5]
array([[ 0.84445696, -0.23824571, 0.05133332, -0.46886997, 1.84019633], [ 0.24766853, 2.07395803, 1.57444985, -0.46886997, -0.43195321], [-0.72214407, -0.0446055 , -0.43939868, -0.46886997, -0.43195321], [-0.48856958, -0.11992158, -0.34379901, -0.46886997, -0.43195321], [ 0.85337791, -0.87758205, -0.54348651, -0.46886997, -0.43195321], [-0.62992082, -0.83196322, -0.62382665, -0.46886997, 0.17004293], [-0.83650183, 3.23074054, 4.28312405, 1.06326271, -0.43195321], [ 1.76828095, -0.19541419, -0.41644878, -0.46886997, -0.43195321], [ 1.78635605, 0.89747804, 0.79717537, -0.46886997, 3.97033652], [-0.01481382, -0.57518987, -0.60442312, 3.47737968, -0.43195321], [-0.33025804, -0.71766019, -0.62552372, -0.46886997, -0.43195321], [-0.75781248, -0.70583032, -0.54596728, -0.46886997, -0.43195321], [ 0.18859962, -0.10301925, -0.50475274, -0.46886997, 1.78339259], [ 3.61705142, 1.91072514, 0.2322093 , -0.46886997, -0.43195321], [-0.52047074, -0.52639929, -0.53731279, -0.46886997, -0.43195321], [-0.35308192, -0.41382042, -0.15623907, 2.47473995, -0.43195321], [ 0.51483816, -0.85369871, -0.61664935, -0.46886997, -0.43195321], [ 2.42485843, -0.08843151, 0.16030939, -0.46886997, 1.84019633], [-0.63055638, -0.3205428 , -0.53076977, -0.46886997, -0.43195321], [-0.81570508, -0.2076344 , -0.22869714, 0.25314756, -0.43195321], [-0.06101258, -0.78797637, -0.31514053, -0.46886997, -0.43195321], [-0.022315 , 0.78012485, 0.51382125, -0.46886997, -0.43195321], [-0.37712312, -0.55331904, -0.3258642 , 2.14150109, -0.43195321], [ 2.33926246, -0.46826211, -0.59521328, 3.47737968, -0.43195321], [ 0.62612061, -0.10351944, -0.37014586, -0.46886997, -0.43195321], [-0.1455937 , -0.05181405, -0.01314966, -0.46886997, 3.97033652], [-0.69280823, -0.04283426, -0.47187415, -0.46886997, -0.43195321], [-0.15946076, -0.25170559, -0.5456459 , -0.46886997, -0.43195321], [-0.80637666, -0.98226192, -0.59981116, -0.46886997, -0.43195321], [-0.45208878, -0.48061962, -0.51288289, -0.46886997, 1.78339259], [ 
0.4867555 , -0.58727866, -0.62254962, 0.24317887, -0.43195321], [-0.74721037, -0.26346288, -0.50770147, -0.46886997, -0.26580227], [ 0.71498921, -0.96701118, -0.62406064, 0.76534802, -0.43195321], [-0.6776111 , -0.6259812 , -0.48297275, 1.75272242, -0.43195321], [ 0.20388901, 0.55832718, 0.6523297 , -0.46886997, 3.97033652], [-0.68462536, -0.15766887, 0.45117911, -0.46886997, 1.22955614], [-0.68130147, -0.59197843, -0.52937716, -0.46886997, -0.43195321], [-0.1213603 , 0.16498962, -0.23803102, 1.33242116, -0.43195321], [-0.68649617, -0.8274596 , -0.61330032, -0.46886997, 0.42010287], [-0.73244378, -0.41583293, -0.54606312, -0.46886997, -0.43195321], [-0.75418874, -0.67194329, -0.56752738, -0.46886997, -0.43195321], [-0.11822093, -0.18759169, -0.14469223, 2.47473995, -0.43195321], [-0.5606009 , -0.89159703, -0.62039587, -0.46886997, -0.43195321], [-0.70811556, -0.99767742, -0.6295606 , 1.33242116, -0.43195321], [-0.44276036, -0.47392695, -0.57065935, -0.46886997, -0.43195321], [-0.61544895, -0.21965453, -0.37710046, -0.46886997, -0.43195321], [-0.57791999, -0.10080471, -0.18860741, -0.46886997, 0.18342062], [ 2.23213936, -0.34291185, -0.59568406, -0.46886997, -0.43195321], [ 2.06354861, 1.39098988, -0.11068883, -0.46886997, -0.43195321], [-0.79190991, -0.9574449 , -0.62376745, 0.24317887, -0.43195321], [ 0.26443406, 2.61416753, 3.08558822, 1.06326271, -0.43195321], [-0.01209987, 0.53594439, 0.23629128, -0.46886997, -0.43195321], [-0.80634591, -0.96716418, -0.61755426, -0.46886997, -0.43195321], [-0.16938374, 0.8696658 , 0.77316551, -0.27051351, -0.23415447], [-0.04311944, 0.18446741, -0.14938031, 0.73199078, -0.43195321], [-0.68041476, 0.47595163, 1.08241827, -0.18405044, -0.43195321], [-0.76631569, -0.33855142, -0.36484886, -0.46886997, 1.06490206], [-0.71942755, -1.0021987 , -0.63005112, -0.46886997, 1.28493979], [ 2.56991029, 0.81760341, 0.29109646, -0.46886997, 1.99108126], [-0.74181833, -0.94662522, -0.62596349, -0.46886997, -0.43195321], [-0.39172056, 
-0.75420507, -0.61813217, 0.73199078, -0.43195321], [-0.74252565, -0.58032118, -0.59939112, -0.46886997, -0.43195321], [-0.52984786, 0.82098506, 1.31844616, -0.46886997, -0.43195321], [-0.75226924, -0.67038781, -0.54044193, -0.46886997, -0.43195321], [-0.51722886, -0.15861235, 0.53360817, -0.46886997, -0.43195321], [ 0.94307429, 1.63510561, 0.0502649 , -0.46886997, 2.33722904], [ 2.27493734, 0.10828435, -0.43806527, 0.81281795, -0.43195321], [-0.62666612, -0.76651942, -0.62548989, -0.46886997, -0.43195321], [-0.66600439, -0.34358661, -0.52269601, -0.46886997, -0.43195321], [ 2.40153737, 0.79677218, 0.14893452, -0.46886997, 0.18342062], [ 0.09391613, 1.61452938, 2.56423569, -0.46886997, -0.43195321], [-0.70296186, -0.26446128, -0.52736154, -0.46886997, -0.43195321], [ 0.44208927, 4.18123112, 3.45974963, -0.46886997, -0.43195321], [-0.78308892, -1.01087642, -0.63087146, -0.46886997, 0.89725427], [-0.54691067, 0.98156598, 0.70367308, -0.46886997, -0.43195321], [ 0.60991632, -0.09784872, -0.51965707, -0.46886997, -0.43195321], [-0.76052643, -0.96000271, -0.62467519, -0.46886997, 0.33726408], [ 1.93383484, 0.09531485, -0.29395536, 4.04373958, -0.43195321], [-0.76653353, -0.6801267 , -0.54957002, -0.46886997, -0.43195321], [-0.83650183, 2.64457877, 2.29128386, -0.46886997, -0.43195321], [-0.51706228, 1.07292328, 2.54926935, -0.46886997, -0.43195321], [-0.39173081, 1.0465606 , 2.24162345, -0.46886997, -0.43195321], [ 0.54861269, -0.32386364, -0.44453781, -0.46886997, -0.43195321], [-0.41346552, -0.66397367, -0.62655268, -0.46886997, -0.43195321], [-0.66187067, -0.05643144, 0.1018028 , 1.19732432, -0.43195321], [-0.61096157, -0.57797129, -0.5712147 , -0.46886997, -0.43195321], [ 0.34407676, 0.4601909 , 0.05937608, -0.46886997, 0.18342062], [-0.46490255, -0.97695015, -0.62658651, 1.7874348 , -0.43195321]])
# Everything after the first five columns: the one-hot indicator columns.
X_trans[:, 5:]
array([[0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 1., 0.], [0., 0., 0., ..., 0., 0., 0.], ..., [0., 0., 0., ..., 0., 0., 0.], [0., 0., 0., ..., 0., 1., 0.], [0., 0., 0., ..., 0., 0., 1.]])
# Total width is the numerical columns plus the one-hot columns.
X_trans.shape
(88, 38)
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Chain the column preprocessing with a linear regression so that raw feature
# frames can be passed straight to fit/score.
lin_reg_steps = [
    ('preprocess', ct),
    ('lin_reg', LinearRegression()),
]
lin_reg = Pipeline(lin_reg_steps)
lin_reg
Pipeline(steps=[('preprocess', ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])), ('lin_reg', LinearRegression())])
ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
LinearRegression()
# Fit the preprocessing and the linear model on the training split.
lin_reg.fit(X_train, y_train)
Pipeline(steps=[('preprocess', ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])), ('lin_reg', LinearRegression())])
ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
LinearRegression()
# Score on the same rows the model was fitted on, i.e. a training-set score.
# NOTE(review): X_test / y_test from the split are not used here; score on them
# to estimate performance on unseen data.
lin_reg.score(X_train, y_train)
0.9195619190476331
from sklearn.ensemble import RandomForestRegressor

# Same column preprocessing as the linear model, followed by a random forest
# with a fixed seed.
# NOTE(review): the final step is labelled 'log_reg' although it holds a
# RandomForestRegressor; the label is left as-is because it is the name the
# step is addressed by.
rf_steps = [
    ('preprocess', ct),
    ('log_reg', RandomForestRegressor(random_state=42)),
]
rf = Pipeline(rf_steps)
rf
Pipeline(steps=[('preprocess', ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])), ('log_reg', RandomForestRegressor(random_state=42))])
ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestRegressor(random_state=42)
# Fit the preprocessing and the random forest on the training split.
rf.fit(X_train, y_train)
Pipeline(steps=[('preprocess', ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])), ('log_reg', RandomForestRegressor(random_state=42))])
ColumnTransformer(transformers=[('numerical', Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())]), ['general', 'gdp', 'fdi', 'i', 'rr']), ('categorical', Pipeline(steps=[('imputer', SimpleImputer(fill_value='sk_missing', strategy='constant')), ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))]), ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestRegressor(random_state=42)
# Score on the same rows the model was fitted on, i.e. a training-set score.
# NOTE(review): X_test / y_test from the split are not used here; score on them
# to estimate performance on unseen data.
rf.score(X_train, y_train)
0.9503718502648469