#code adapted from https://github.com/thomasjpfan/ml-workshop-intermediate-1-of-2
import pandas as pd
# Download the province-level panel dataset directly from GitHub.
url = 'https://raw.githubusercontent.com/davidrkearney/Kearney_Data_Science/master/_notebooks/df_panel_fix.csv'
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0
# (it now raises TypeError). `on_bad_lines='warn'` keeps the same intent:
# malformed rows are skipped (with a warning) instead of aborting the read.
df = pd.read_csv(url, on_bad_lines='warn')
df

import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split



# Show which columns the frame contains.
df.columns

# Render scikit-learn estimators as diagrams in notebook output.
sklearn.set_config(display='diagram')

# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

# Per-column count of missing values that remain after the drop.
df.isna().sum()

# Target is the 'specific' column; the features are everything else except
# the 'Unnamed: 0' column.
y = df['specific']
X = df.drop(columns=['specific', 'Unnamed: 0'])

X

# Draw histograms of the feature columns on a 5x8 grid of axes.
_ = X.hist(layout=(5, 8), figsize=(30, 15))
import numpy as np
import sklearn
sklearn.set_config(display='diagram')

Categorical Data

OrdinalEncoder

from sklearn.preprocessing import OrdinalEncoder
# Fit an ordinal encoder on the whole frame. Every column, numeric ones
# included, is treated as categorical and mapped to integer codes.
ord_encoder = OrdinalEncoder()
ord_encoder.fit_transform(df)
array([[  0.,   0.,  17., ...,  24.,   0.,  28.],
       [  1.,   0.,  48., ...,  29.,   0.,  60.],
       [  2.,   0.,  64., ...,  40.,   0.,  67.],
       ...,
       [115.,  27.,  44., ...,  97.,   0.,  53.],
       [116.,  27.,  84., ...,   4.,   0.,  64.],
       [117.,  27.,  78., ...,  25.,   0.,  71.]])
# One array per input column listing the distinct values learned during fit.
ord_encoder.categories_
[array([  4,   6,   7,  10,  11,  16,  18,  19,  22,  23,  34,  35,  40,
         42,  43,  46,  47,  52,  54,  58,  64,  66,  67,  70,  71,  76,
         78,  79,  82,  83,  88,  90,  91,  94,  95, 107, 112, 119, 124,
        126, 127, 130, 131, 136, 138, 139, 142, 143, 148, 150, 151, 154,
        155, 160, 162, 163, 166, 167, 172, 174, 175, 178, 179, 184, 186,
        187, 190, 191, 196, 198, 199, 202, 203, 220, 222, 223, 226, 227,
        232, 234, 235, 239, 244, 246, 247, 250, 251, 258, 259, 262, 263,
        280, 282, 283, 292, 294, 295, 298, 310, 316, 318, 319, 322, 323,
        328, 330, 331, 334, 335, 340, 342, 343, 346, 347, 354, 355, 358,
        359]),
 array(['Anhui', 'Beijing', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong',
        'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan',
        'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Ningxia',
        'Qinghai', 'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Tianjin',
        'Tibet', 'Xinjiang', 'Yunnan', 'Zhejiang'], dtype=object),
 array([  72939.,   91405.,   96825.,  103041.,  107046.,  107687.,
         119536.,  127819.,  133858.,  137190.,  142650.,  144839.,
         147749.,  148812.,  160637.,  179235.,  181409.,  195580.,
         207615.,  217707.,  223984.,  228043.,  237948.,  248903.,
         251539.,  254002.,  262197.,  265770.,  271297.,  271499.,
         281769.,  295133.,  319710.,  331999.,  337894.,  340372.,
         353776.,  354775.,  359275.,  365437.,  367568.,  369552.,
         370049.,  379186.,  391292.,  395775.,  397517.,  430577.,
         434149.,  436189.,  447709.,  458201.,  469514.,  472786.,
         484715.,  487155.,  510656.,  518022.,  531994.,  558569.,
         575550.,  591088.,  601485.,  615593.,  619201.,  642581.,
         675931.,  681676.,  684767.,  685732.,  688887.,  714572.,
         752279.,  753552.,  761081.,  821750.,  833423.,  833430.,
         844647.,  859482.,  875877.,  909559.,  978069.,  985851.,
        1017303., 1035872., 1097470., 1188989., 1204547., 1207353.,
        1224179., 1235386., 1246484., 1315102., 1331590., 1358528.,
        1388043., 1457872., 1550764., 1562694., 1791403., 1890650.,
        1897575., 1956261., 2018158., 2022957., 2045869., 2121243.,
        2213991., 2225220., 2482173., 2663667., 2669238., 2922784.,
        2981235., 3156087., 3847672., 3860764.]),
 array([      0.,    2990.,    8115.,   11755.,   11767.,   17400.,
          20842.,   26300.,   27302.,   27387.,   29646.,   30705.,
          32100.,   32119.,   32868.,   34842.,   36670.,   36946.,
          40604.,   44623.,   45683.,   50097.,   52108.,   53900.,
          56070.,   58533.,   59263.,   60560.,   60906.,   62000.,
          66100.,   66529.,   68142.,   71807.,   80361.,   80609.,
          81879.,   86256.,   88007.,   93323.,   95648.,  100000.,
         100900.,  107658.,  108624.,  112137.,  113000.,  114418.,
         116000.,  119658.,  123317.,  123546.,  124582.,  124647.,
         129791.,  135765.,  143800.,  145000.,  149549.,  150000.,
         153640.,  154364.,  165071.,  173552.,  173556.,  178705.,
         179252.,  188633.,  197539.,  229895.,  241282.,  260313.,
         264185.,  269596.,  279052.,  280277.,  302600.,  309582.,
         317700.,  320627.,  321686.,  363054.,  394795.,  400000.,
         405966.,  423049.,  429591.,  434318.,  447900.,  460668.,
         498913.,  516342.,  527300.,  540479.,  564400.,  570723.,
         581800.,  605400.,  655919.,  659400.,  694400.,  763953.,
        1016400., 1023453., 1046700., 1081000., 1131615., 1187958.,
        1197400., 1214100., 1239200., 1263500., 1272600., 1329200.,
        1737800.]),
 array([2000, 2002, 2003, 2006, 2007]),
 array([  117.8 ,   162.04,   185.09,   263.68,   290.76,   295.02,
          340.65,   341.43,   377.16,   390.2 ,   445.36,   725.9 ,
          797.35,   919.11,  1029.92,  1052.88,  1232.03,  1243.43,
         1254.17,  1363.56,  1426.34,  1612.65,  1672.96,  1804.  ,
         1845.72,  1853.65,  1886.35,  2011.19,  2080.04,  2120.35,
         2175.68,  2253.39,  2277.35,  2312.82,  2324.8 ,  2338.98,
         2348.54,  2450.48,  2523.73,  2556.02,  2587.72,  2821.11,
         2855.23,  2884.11,  2902.09,  3045.26,  3151.4 ,  3161.66,
         3519.72,  3523.16,  3545.39,  3551.49,  3620.27,  3637.2 ,
         3764.54,  3907.23,  3923.11,  3928.2 ,  3988.14,  4056.76,
         4057.4 ,  4151.54,  4212.82,  4275.12,  4315.  ,  4462.74,
         4467.55,  4659.99,  4676.13,  4725.01,  4743.61,  4746.16,
         4757.45,  4772.52,  4820.53,  4983.67,  5007.21,  5043.96,
         5052.99,  5333.09,  5757.29,  5823.41,  6035.48,  6112.5 ,
         6211.8 ,  6867.7 ,  7104.  ,  7360.92,  7583.85,  7617.47,
         7688.67,  7697.82,  8003.67,  8117.78,  8690.24,  9248.53,
         9333.4 ,  9439.6 ,  9456.84,  9705.02,  9846.81, 10275.5 ,
        10606.85, 10741.25, 12078.15, 12362.79, 13502.42, 13607.32,
        15012.46, 15718.47, 15844.64, 18598.69, 18753.73, 21742.05,
        21900.19, 25776.91, 26587.76, 31777.01]),
 array([      2,     293,     467,    1522,    1534,    1741,    1743,
           1899,    1911,    2200,    2418,    2501,    2522,    2954,
           3718,    3821,    4521,    4726,    5047,    6121,    6235,
           8384,    9384,   10366,   11020,   11169,   12484,   12651,
          12812,   21164,   21361,   22472,   24468,   28842,   30086,
          30120,   30234,   31000,   31847,   32080,   32180,   33190,
          33766,   35511,   36005,   36720,   38375,   39453,   39575,
          40463,   41231,   41726,   41856,   43694,   44740,   52466,
          53903,   55583,   56403,   66100,   67833,   67923,   68396,
          69595,   76064,   90022,   92489,   94368,  101835,  108197,
         108534,  112001,  119516,  120819,  139354,  142665,  156886,
         168368,  170801,  172464,  184526,  208508,  219126,  241621,
         242000,  244853,  259335,  259903,  276622,  280657,  299892,
         306162,  307610,  322047,  327051,  343191,  383837,  406058,
         413077,  455191,  473404,  498055,  506572,  601617,  607756,
         691482,  782294,  888935, 1000069, 1018960, 1036576, 1101159,
        1128091, 1133400, 1318339, 1451065, 1712603, 1743140]),
 array([0.        , 0.02702703, 0.03      , 0.03125   , 0.04761905,
        0.09677419, 0.20512821, 0.22      , 0.4       , 1.21428571]),
 array([0.        , 0.03      , 0.03571429, 0.10869565, 0.11111111,
        0.13      , 0.13888889, 0.15384615, 0.16      , 0.24      ,
        0.27027027, 0.3       , 0.31      , 0.4       , 0.41025641,
        0.4375    , 0.5       , 0.7948718 ]),
 array([0.        , 0.03571429, 0.05128205, 0.12820513, 0.13      ,
        0.21621622, 0.22222222, 0.23076923, 0.27586207, 0.3       ,
        0.32432432, 0.4       , 0.40625   , 0.47      , 0.51612903,
        0.53      , 0.55      , 0.71052632, 0.8125    ]),
 array(['1060812', '1082935', '1089674', '1108348', '11537149', '1163113',
        '11673659', '118013', '1212843', '123888', '1292604', '1310512',
        '1321004', '1389153', '1443753', '147235', '14740022', '14926380',
        '1514364', '1514799', '1543658', '1548155', '157652', '1600475',
        '1601508', '16494981', '1667114', '16753980', '1675757', '1677840',
        '16804703', '169770', '1710605', '1723026', '1755299', '1762409',
        '1802055', '1807967', '1841592', '1851377', '1896479', '1913563',
        '1925862', '1938812', '201412', '2018672', '2024337', '202761',
        '2110577', '2125369', '2195820', '22377276', '2308652', '2329505',
        '233299', '2373047', '2419708', '2450874', '2511249', '2523352',
        '2525301', '2567976', '2648861', '27858007', '2823366', '2823413',
        '2851375', '2858600', '2972212', '3206892', '3434548', '3444533',
        '3816261', '3898510', '4032810', '4188265', '4247403', '4404689',
        '4427000', '4468640', '447643', '4752398', '4830320', '4830392',
        '4867146', '4958329', '505196', '50819', '5145006', '5596906',
        '567083', '5903552', '597159', '59841', '6065508', '6166904',
        '6212824', '6217715', '6879383', '693750', '6994577', '70048',
        '7071605', '740947', '776120', '7891198', '800312', '830159',
        '8620804', '8818088', '919235', '924080', '932549', '960708',
        '966606', '971485', '974325', '9898522'], dtype=object),
 array(['East China', 'North China', 'Northeast China', 'Northwest China',
        'South Central China', 'Southwest China'], dtype=object),
 array([  475184,   546541,   632880,   736165,   757990,   819028,
          866691,   948521,   976396,  1047698,  1078754,  1109537,
         1174622,  1184990,  1210637,  1216605,  1228569,  1258100,
         1308445,  1333133,  1364344,  1364980,  1423771,  1426600,
         1428990,  1440939,  1472622,  1492835,  1499110,  1554999,
         1648826,  1658350,  1742585,  1782317,  1845611,  1873822,
         1898911,  1927102,  1962192,  1962633,  1986738,  2017594,
         2023674,  2047192,  2052220,  2053980,  2072426,  2135224,
         2138158,  2138758,  2143190,  2150325,  2254281,  2261631,
         2268499,  2339769,  2347862,  2355164,  2376983,  2378616,
         2404936,  2444270,  2455900,  2545841,  2553268,  2649011,
         2764053,  2815820,  2867525,  2907301,  2926542,  2939778,
         2940367,  2977880,  3035767,  3051103,  3101537,  3114638,
         3124234,  3343228,  3388449,  3545004,  3557071,  3586373,
         3847158,  3893879,  3923569,  4039036,  4062020,  4073606,
         4133488,  4229821,  4390259,  4559252,  4607955,  4613724,
         4686125,  4947824,  5046865,  5167300,  5304833,  5502470,
         5639838,  6003791,  6033279,  6185600,  6308151,  6349262,
         6357869,  6832541,  7040099,  7537692,  7601825,  7646885,
         7666512,  7968319,  8340692, 10533312])]
# Re-apply the already-fitted encoder to the same frame (no refit).
ord_encoder.transform(df)
array([[  0.,   0.,  17., ...,  24.,   0.,  28.],
       [  1.,   0.,  48., ...,  29.,   0.,  60.],
       [  2.,   0.,  64., ...,  40.,   0.,  67.],
       ...,
       [115.,  27.,  44., ...,  97.,   0.,  53.],
       [116.,  27.,  84., ...,   4.,   0.,  64.],
       [117.,  27.,  78., ...,  25.,   0.,  71.]])

Categories that are unknown during fit

How to handle unknown categories in OrdinalEncoder?

Provide all the categories in the constructor

OneHotEncoder

from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column of the frame. With default settings the result
# is a sparse matrix with one indicator column per distinct value per column.
ohe = OneHotEncoder()
X_trans = ohe.fit_transform(df)
X_trans
<118x909 sparse matrix of type '<class 'numpy.float64'>'
	with 1534 stored elements in Compressed Sparse Row format>
# Convert the sparse result to a dense array so the values can be inspected.
X_trans.toarray()
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Switch to dense

# Ask the encoder to return a dense NumPy array instead of a sparse matrix.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old spelling was removed in 1.4, so the current name is used here.
ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(df)
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

Unknown categories during transform?

OHE can handle unknowns

# Dense output plus tolerant handling of unseen categories: with
# handle_unknown='ignore', a value not seen during fit is encoded as all
# zeros for that feature at transform time instead of raising an error.
# `sparse_output` replaces the `sparse` keyword (renamed in scikit-learn 1.2,
# old name removed in 1.4).
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
ohe.fit(df)
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Encode the frame with the fitted one-hot encoder.
ohe.transform(df)
array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])
# One array per input column listing the distinct values learned during fit.
ohe.categories_
[array([  4,   6,   7,  10,  11,  16,  18,  19,  22,  23,  34,  35,  40,
         42,  43,  46,  47,  52,  54,  58,  64,  66,  67,  70,  71,  76,
         78,  79,  82,  83,  88,  90,  91,  94,  95, 107, 112, 119, 124,
        126, 127, 130, 131, 136, 138, 139, 142, 143, 148, 150, 151, 154,
        155, 160, 162, 163, 166, 167, 172, 174, 175, 178, 179, 184, 186,
        187, 190, 191, 196, 198, 199, 202, 203, 220, 222, 223, 226, 227,
        232, 234, 235, 239, 244, 246, 247, 250, 251, 258, 259, 262, 263,
        280, 282, 283, 292, 294, 295, 298, 310, 316, 318, 319, 322, 323,
        328, 330, 331, 334, 335, 340, 342, 343, 346, 347, 354, 355, 358,
        359]),
 array(['Anhui', 'Beijing', 'Chongqing', 'Fujian', 'Gansu', 'Guangdong',
        'Guangxi', 'Guizhou', 'Hainan', 'Hebei', 'Heilongjiang', 'Henan',
        'Hubei', 'Hunan', 'Jiangsu', 'Jiangxi', 'Jilin', 'Ningxia',
        'Qinghai', 'Shaanxi', 'Shandong', 'Shanxi', 'Sichuan', 'Tianjin',
        'Tibet', 'Xinjiang', 'Yunnan', 'Zhejiang'], dtype=object),
 array([  72939.,   91405.,   96825.,  103041.,  107046.,  107687.,
         119536.,  127819.,  133858.,  137190.,  142650.,  144839.,
         147749.,  148812.,  160637.,  179235.,  181409.,  195580.,
         207615.,  217707.,  223984.,  228043.,  237948.,  248903.,
         251539.,  254002.,  262197.,  265770.,  271297.,  271499.,
         281769.,  295133.,  319710.,  331999.,  337894.,  340372.,
         353776.,  354775.,  359275.,  365437.,  367568.,  369552.,
         370049.,  379186.,  391292.,  395775.,  397517.,  430577.,
         434149.,  436189.,  447709.,  458201.,  469514.,  472786.,
         484715.,  487155.,  510656.,  518022.,  531994.,  558569.,
         575550.,  591088.,  601485.,  615593.,  619201.,  642581.,
         675931.,  681676.,  684767.,  685732.,  688887.,  714572.,
         752279.,  753552.,  761081.,  821750.,  833423.,  833430.,
         844647.,  859482.,  875877.,  909559.,  978069.,  985851.,
        1017303., 1035872., 1097470., 1188989., 1204547., 1207353.,
        1224179., 1235386., 1246484., 1315102., 1331590., 1358528.,
        1388043., 1457872., 1550764., 1562694., 1791403., 1890650.,
        1897575., 1956261., 2018158., 2022957., 2045869., 2121243.,
        2213991., 2225220., 2482173., 2663667., 2669238., 2922784.,
        2981235., 3156087., 3847672., 3860764.]),
 array([      0.,    2990.,    8115.,   11755.,   11767.,   17400.,
          20842.,   26300.,   27302.,   27387.,   29646.,   30705.,
          32100.,   32119.,   32868.,   34842.,   36670.,   36946.,
          40604.,   44623.,   45683.,   50097.,   52108.,   53900.,
          56070.,   58533.,   59263.,   60560.,   60906.,   62000.,
          66100.,   66529.,   68142.,   71807.,   80361.,   80609.,
          81879.,   86256.,   88007.,   93323.,   95648.,  100000.,
         100900.,  107658.,  108624.,  112137.,  113000.,  114418.,
         116000.,  119658.,  123317.,  123546.,  124582.,  124647.,
         129791.,  135765.,  143800.,  145000.,  149549.,  150000.,
         153640.,  154364.,  165071.,  173552.,  173556.,  178705.,
         179252.,  188633.,  197539.,  229895.,  241282.,  260313.,
         264185.,  269596.,  279052.,  280277.,  302600.,  309582.,
         317700.,  320627.,  321686.,  363054.,  394795.,  400000.,
         405966.,  423049.,  429591.,  434318.,  447900.,  460668.,
         498913.,  516342.,  527300.,  540479.,  564400.,  570723.,
         581800.,  605400.,  655919.,  659400.,  694400.,  763953.,
        1016400., 1023453., 1046700., 1081000., 1131615., 1187958.,
        1197400., 1214100., 1239200., 1263500., 1272600., 1329200.,
        1737800.]),
 array([2000, 2002, 2003, 2006, 2007]),
 array([  117.8 ,   162.04,   185.09,   263.68,   290.76,   295.02,
          340.65,   341.43,   377.16,   390.2 ,   445.36,   725.9 ,
          797.35,   919.11,  1029.92,  1052.88,  1232.03,  1243.43,
         1254.17,  1363.56,  1426.34,  1612.65,  1672.96,  1804.  ,
         1845.72,  1853.65,  1886.35,  2011.19,  2080.04,  2120.35,
         2175.68,  2253.39,  2277.35,  2312.82,  2324.8 ,  2338.98,
         2348.54,  2450.48,  2523.73,  2556.02,  2587.72,  2821.11,
         2855.23,  2884.11,  2902.09,  3045.26,  3151.4 ,  3161.66,
         3519.72,  3523.16,  3545.39,  3551.49,  3620.27,  3637.2 ,
         3764.54,  3907.23,  3923.11,  3928.2 ,  3988.14,  4056.76,
         4057.4 ,  4151.54,  4212.82,  4275.12,  4315.  ,  4462.74,
         4467.55,  4659.99,  4676.13,  4725.01,  4743.61,  4746.16,
         4757.45,  4772.52,  4820.53,  4983.67,  5007.21,  5043.96,
         5052.99,  5333.09,  5757.29,  5823.41,  6035.48,  6112.5 ,
         6211.8 ,  6867.7 ,  7104.  ,  7360.92,  7583.85,  7617.47,
         7688.67,  7697.82,  8003.67,  8117.78,  8690.24,  9248.53,
         9333.4 ,  9439.6 ,  9456.84,  9705.02,  9846.81, 10275.5 ,
        10606.85, 10741.25, 12078.15, 12362.79, 13502.42, 13607.32,
        15012.46, 15718.47, 15844.64, 18598.69, 18753.73, 21742.05,
        21900.19, 25776.91, 26587.76, 31777.01]),
 array([      2,     293,     467,    1522,    1534,    1741,    1743,
           1899,    1911,    2200,    2418,    2501,    2522,    2954,
           3718,    3821,    4521,    4726,    5047,    6121,    6235,
           8384,    9384,   10366,   11020,   11169,   12484,   12651,
          12812,   21164,   21361,   22472,   24468,   28842,   30086,
          30120,   30234,   31000,   31847,   32080,   32180,   33190,
          33766,   35511,   36005,   36720,   38375,   39453,   39575,
          40463,   41231,   41726,   41856,   43694,   44740,   52466,
          53903,   55583,   56403,   66100,   67833,   67923,   68396,
          69595,   76064,   90022,   92489,   94368,  101835,  108197,
         108534,  112001,  119516,  120819,  139354,  142665,  156886,
         168368,  170801,  172464,  184526,  208508,  219126,  241621,
         242000,  244853,  259335,  259903,  276622,  280657,  299892,
         306162,  307610,  322047,  327051,  343191,  383837,  406058,
         413077,  455191,  473404,  498055,  506572,  601617,  607756,
         691482,  782294,  888935, 1000069, 1018960, 1036576, 1101159,
        1128091, 1133400, 1318339, 1451065, 1712603, 1743140]),
 array([0.        , 0.02702703, 0.03      , 0.03125   , 0.04761905,
        0.09677419, 0.20512821, 0.22      , 0.4       , 1.21428571]),
 array([0.        , 0.03      , 0.03571429, 0.10869565, 0.11111111,
        0.13      , 0.13888889, 0.15384615, 0.16      , 0.24      ,
        0.27027027, 0.3       , 0.31      , 0.4       , 0.41025641,
        0.4375    , 0.5       , 0.7948718 ]),
 array([0.        , 0.03571429, 0.05128205, 0.12820513, 0.13      ,
        0.21621622, 0.22222222, 0.23076923, 0.27586207, 0.3       ,
        0.32432432, 0.4       , 0.40625   , 0.47      , 0.51612903,
        0.53      , 0.55      , 0.71052632, 0.8125    ]),
 array(['1060812', '1082935', '1089674', '1108348', '11537149', '1163113',
        '11673659', '118013', '1212843', '123888', '1292604', '1310512',
        '1321004', '1389153', '1443753', '147235', '14740022', '14926380',
        '1514364', '1514799', '1543658', '1548155', '157652', '1600475',
        '1601508', '16494981', '1667114', '16753980', '1675757', '1677840',
        '16804703', '169770', '1710605', '1723026', '1755299', '1762409',
        '1802055', '1807967', '1841592', '1851377', '1896479', '1913563',
        '1925862', '1938812', '201412', '2018672', '2024337', '202761',
        '2110577', '2125369', '2195820', '22377276', '2308652', '2329505',
        '233299', '2373047', '2419708', '2450874', '2511249', '2523352',
        '2525301', '2567976', '2648861', '27858007', '2823366', '2823413',
        '2851375', '2858600', '2972212', '3206892', '3434548', '3444533',
        '3816261', '3898510', '4032810', '4188265', '4247403', '4404689',
        '4427000', '4468640', '447643', '4752398', '4830320', '4830392',
        '4867146', '4958329', '505196', '50819', '5145006', '5596906',
        '567083', '5903552', '597159', '59841', '6065508', '6166904',
        '6212824', '6217715', '6879383', '693750', '6994577', '70048',
        '7071605', '740947', '776120', '7891198', '800312', '830159',
        '8620804', '8818088', '919235', '924080', '932549', '960708',
        '966606', '971485', '974325', '9898522'], dtype=object),
 array(['East China', 'North China', 'Northeast China', 'Northwest China',
        'South Central China', 'Southwest China'], dtype=object),
 array([  475184,   546541,   632880,   736165,   757990,   819028,
          866691,   948521,   976396,  1047698,  1078754,  1109537,
         1174622,  1184990,  1210637,  1216605,  1228569,  1258100,
         1308445,  1333133,  1364344,  1364980,  1423771,  1426600,
         1428990,  1440939,  1472622,  1492835,  1499110,  1554999,
         1648826,  1658350,  1742585,  1782317,  1845611,  1873822,
         1898911,  1927102,  1962192,  1962633,  1986738,  2017594,
         2023674,  2047192,  2052220,  2053980,  2072426,  2135224,
         2138158,  2138758,  2143190,  2150325,  2254281,  2261631,
         2268499,  2339769,  2347862,  2355164,  2376983,  2378616,
         2404936,  2444270,  2455900,  2545841,  2553268,  2649011,
         2764053,  2815820,  2867525,  2907301,  2926542,  2939778,
         2940367,  2977880,  3035767,  3051103,  3101537,  3114638,
         3124234,  3343228,  3388449,  3545004,  3557071,  3586373,
         3847158,  3893879,  3923569,  4039036,  4062020,  4073606,
         4133488,  4229821,  4390259,  4559252,  4607955,  4613724,
         4686125,  4947824,  5046865,  5167300,  5304833,  5502470,
         5639838,  6003791,  6033279,  6185600,  6308151,  6349262,
         6357869,  6832541,  7040099,  7537692,  7601825,  7646885,
         7666512,  7968319,  8340692, 10533312])]

Two categorical features

# Small hand-built frame with two categorical columns, used to show how the
# one-hot output is laid out per feature.
# NOTE(review): 'Shanghai' is paired with 'Southwest China' here — this looks
# like toy data rather than a real province/region mapping; confirm.
df_train = pd.DataFrame({
    "province": ["Zhejiang", "Beijing", "Shanghai"],
    "region": ["East China", "North China", "Southwest China"]
})
# Refitting replaces whatever categories `ohe` learned earlier with the ones
# found in df_train.
ohe.fit(df_train)
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Categories learned from df_train: one sorted array per column.
ohe.categories_
[array(['Beijing', 'Shanghai', 'Zhejiang'], dtype=object),
 array(['East China', 'North China', 'Southwest China'], dtype=object)]
# Each row gets one indicator block per feature: province first, then region.
ohe.transform(df_train)
array([[0., 0., 1., 1., 0., 0.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.]])

Column Transformer!

With OrdinalEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
# Display the frame that the column transformers below are applied to.
df
Unnamed: 0 province specific general year gdp fdi rnr rr i fr reg it
4 4 Anhui 195580.0 32100.0 2000 2902.09 31847 0.000000 0.000000 0.000000 1601508 East China 1499110
6 6 Anhui 434149.0 66529.0 2002 3519.72 38375 0.000000 0.000000 0.000000 1677840 East China 2404936
7 7 Anhui 619201.0 52108.0 2003 3923.11 36720 0.000000 0.000000 0.000000 1896479 East China 2815820
10 10 Anhui 1457872.0 279052.0 2006 6112.50 139354 0.000000 0.000000 0.324324 3434548 East China 5167300
11 11 Anhui 2213991.0 178705.0 2007 7360.92 299892 0.000000 0.000000 0.324324 4468640 East China 7040099
... ... ... ... ... ... ... ... ... ... ... ... ... ...
347 347 Yunnan 2482173.0 564400.0 2007 4772.52 39453 0.000000 0.000000 0.000000 4867146 Southwest China 6832541
354 354 Zhejiang 365437.0 321686.0 2002 8003.67 307610 0.000000 0.000000 0.000000 4958329 East China 1962633
355 355 Zhejiang 391292.0 260313.0 2003 9705.02 498055 1.214286 0.035714 0.035714 6217715 East China 2261631
358 358 Zhejiang 1017303.0 394795.0 2006 15718.47 888935 1.214286 0.035714 0.035714 11537149 East China 2553268
359 359 Zhejiang 844647.0 0.0 2007 18753.73 1036576 0.047619 0.000000 0.000000 16494981 East China 2939778

118 rows × 13 columns

# Apply a different transformer per column group: standardize 'fdi' and 'gdp',
# and integer-encode 'reg'. Columns that are not listed do not appear in the
# output (the result has three columns).
ct = ColumnTransformer([
    ('numerical', StandardScaler(), ['fdi', 'gdp']),
    ('categorical', OrdinalEncoder(), ['reg'])
])

# Fit all transformers on the frame and return the combined output array.
ct.fit_transform(df)
array([[-0.54088554, -0.48498461,  0.        ],
       [-0.52313911, -0.37867596,  0.        ],
       [-0.52763824, -0.30924305,  0.        ],
       [-0.24862668,  0.06760246,  0.        ],
       [ 0.18779748,  0.28248491,  0.        ],
       [-0.16975183, -0.44030651,  1.        ],
       [-0.15861681, -0.24178957,  1.        ],
       [-0.0317657 , -0.12264395,  1.        ],
       [ 0.60997938,  0.41275831,  1.        ],
       [ 0.74965914,  0.71036505,  1.        ],
       [-0.43826722, -0.31197638,  5.        ],
       [-0.33241116, -0.1796306 ,  5.        ],
       [ 0.30550625, -0.33653668,  0.        ],
       [ 0.41600281, -0.21553212,  0.        ],
       [ 0.07908699, -0.12669573,  0.        ],
       [ 0.24802608,  0.3208564 ,  0.        ],
       [ 0.47641082,  0.60738699,  0.        ],
       [-0.61051202, -0.80327715,  3.        ],
       [-0.61082193, -0.77244122,  3.        ],
       [-0.61943145, -0.59251706,  3.        ],
       [ 2.43926479,  0.86431921,  4.        ],
       [ 2.45369736,  1.33958151,  4.        ],
       [ 1.49921217,  1.74273266,  4.        ],
       [ 3.31727285,  3.5918763 ,  4.        ],
       [ 4.02826654,  4.48506828,  4.        ],
       [-0.48483258, -0.62647875,  4.        ],
       [-0.51402938, -0.55010927,  4.        ],
       [-0.51367597, -0.49892317,  4.        ],
       [-0.50583579, -0.16757679,  4.        ],
       [-0.44152672,  0.01784327,  4.        ],
       [-0.62066294, -0.8072291 ,  5.        ],
       [-0.6170745 , -0.77047901,  5.        ],
       [-0.61517155, -0.7389959 ,  5.        ],
       [-0.60195143, -0.58190909,  5.        ],
       [-0.59307006, -0.48807939,  5.        ],
       [-0.32298609, -0.7686304 ,  4.        ],
       [-0.44281257, -0.11631841,  1.        ],
       [ 0.02938719,  1.35763727,  1.        ],
       [-0.54567284, -0.4420725 ,  2.        ],
       [-0.53092492, -0.35845489,  2.        ],
       [-0.53998028, -0.28612859,  2.        ],
       [-0.1631377 ,  0.08469433,  2.        ],
       [-0.06063084,  0.23826293,  2.        ],
       [-0.47412981, -0.11476413,  4.        ],
       [-0.51746286,  0.05434551,  4.        ],
       [-0.48092608,  0.19759014,  4.        ],
       [-0.12582615,  1.14342438,  4.        ],
       [ 0.20484254,  1.59949491,  4.        ],
       [-0.37092157, -0.37425755,  4.        ],
       [-0.23962569, -0.25937715,  4.        ],
       [-0.20096576, -0.16563352,  4.        ],
       [ 0.03817341,  0.3266432 ,  4.        ],
       [ 0.12453776,  0.62199511,  4.        ],
       [-0.44305724, -0.3732076 ,  4.        ],
       [-0.38273622, -0.26992488,  4.        ],
       [-0.35062246, -0.18240867,  4.        ],
       [ 0.07754287,  0.33889839,  4.        ],
       [ 0.2616295 ,  0.64027463,  4.        ],
       [ 1.02472886,  0.34047332,  0.        ],
       [ 1.25233884,  0.64324204,  0.        ],
       [ 2.14259107,  0.84118581,  0.        ],
       [ 2.95645589,  2.21676944,  0.        ],
       [ 4.11128168,  2.75781563,  0.        ],
       [-0.54025213, -0.6654458 ,  0.        ],
       [-0.5198769 , -0.61001686,  0.        ],
       [-0.3333273 , -0.56271731,  0.        ],
       [ 0.0304175 , -0.28623875,  0.        ],
       [ 0.13550694, -0.15477596,  0.        ],
       [-0.54558041, -0.69654679,  2.        ],
       [-0.53566872, -0.61954045,  2.        ],
       [-0.56094543, -0.58026359,  2.        ],
       [-0.44776842, -0.36136894,  2.        ],
       [-0.42068118, -0.24865385,  2.        ],
       [-0.62272901, -0.93372268,  3.        ],
       [-0.62148121, -0.91958445,  3.        ],
       [-0.62272357, -0.90784563,  3.        ],
       [-0.61735451, -0.8595581 ,  3.        ],
       [-0.61374161, -0.82630211,  3.        ],
       [-0.59750395, -0.93911703,  3.        ],
       [-0.61461425, -0.92586868,  3.        ],
       [-0.62060585, -0.91733996,  3.        ],
       [-0.54318812, -0.84725987,  3.        ],
       [-0.54905466, -0.67399173,  3.        ],
       [-0.52958198, -0.59664114,  3.        ],
       [-0.53723458, -0.53909508,  3.        ],
       [-0.37602966, -0.16801571,  3.        ],
       [-0.30255648,  0.00646247,  3.        ],
       [ 0.6594916 ,  0.78415268,  0.        ],
       [ 1.00803993,  1.09443114,  0.        ],
       [ 2.0912357 ,  2.78503525,  0.        ],
       [ 2.36604988,  3.45230994,  0.        ],
       [-0.56637157, -0.66681073,  1.        ],
       [-0.56992738, -0.5843498 ,  1.        ],
       [-0.56939184, -0.49305032,  1.        ],
       [-0.50867935, -0.30836695,  5.        ],
       [-0.47635899, -0.1712172 ,  5.        ],
       [-0.51537504, -0.06655233,  5.        ],
       [-0.29901427,  0.51129215,  5.        ],
       [ 0.49549204, -0.21636004,  1.        ],
       [-0.62745649, -0.96422641,  5.        ],
       [-0.62666541, -0.95661166,  5.        ],
       [-0.62619239, -0.95264422,  5.        ],
       [-0.62332436, -0.93445592,  5.        ],
       [-0.62088857, -0.92573443,  5.        ],
       [-0.62226686, -0.74980181,  3.        ],
       [-0.62229948, -0.70692756,  3.        ],
       [-0.62329174, -0.65981736,  3.        ],
       [-0.59928186, -0.46034169,  3.        ],
       [-0.59352405, -0.37808386,  3.        ],
       [-0.59263238, -0.63832946,  5.        ],
       [-0.59709889, -0.58641184,  5.        ],
       [-0.60466994, -0.5445514 ,  5.        ],
       [-0.5452705 , -0.29804986,  5.        ],
       [-0.52020855, -0.16303961,  5.        ],
       [ 0.20877895,  0.3931173 ,  0.        ],
       [ 0.72650559,  0.68595965,  0.        ],
       [ 1.7891168 ,  1.72101584,  0.        ],
       [ 2.19048034,  2.24345547,  0.        ]])

With OneHotEncoder

# Same layout as the ordinal version, but 'reg' is one-hot encoded: the two
# scaled numeric columns are followed by one indicator column per region.
ct = ColumnTransformer([
    ('numerical', StandardScaler(), ['fdi', 'gdp']),
    ('categorical', OneHotEncoder(), ['reg'])
])
# Fit all transformers on the frame and return the combined output array.
ct.fit_transform(df)
array([[-0.54088554, -0.48498461,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.52313911, -0.37867596,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.52763824, -0.30924305,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.24862668,  0.06760246,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.18779748,  0.28248491,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.16975183, -0.44030651,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.15861681, -0.24178957,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.0317657 , -0.12264395,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.60997938,  0.41275831,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.74965914,  0.71036505,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.43826722, -0.31197638,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.33241116, -0.1796306 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.30550625, -0.33653668,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.41600281, -0.21553212,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.07908699, -0.12669573,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.24802608,  0.3208564 ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.47641082,  0.60738699,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.61051202, -0.80327715,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.61082193, -0.77244122,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.61943145, -0.59251706,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 2.43926479,  0.86431921,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 2.45369736,  1.33958151,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 1.49921217,  1.74273266,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 3.31727285,  3.5918763 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 4.02826654,  4.48506828,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.48483258, -0.62647875,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.51402938, -0.55010927,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.51367597, -0.49892317,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.50583579, -0.16757679,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.44152672,  0.01784327,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.62066294, -0.8072291 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.6170745 , -0.77047901,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.61517155, -0.7389959 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.60195143, -0.58190909,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.59307006, -0.48807939,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.32298609, -0.7686304 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.44281257, -0.11631841,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.02938719,  1.35763727,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.54567284, -0.4420725 ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.53092492, -0.35845489,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.53998028, -0.28612859,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.1631377 ,  0.08469433,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.06063084,  0.23826293,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.47412981, -0.11476413,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.51746286,  0.05434551,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.48092608,  0.19759014,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.12582615,  1.14342438,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.20484254,  1.59949491,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.37092157, -0.37425755,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.23962569, -0.25937715,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.20096576, -0.16563352,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.03817341,  0.3266432 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.12453776,  0.62199511,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.44305724, -0.3732076 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.38273622, -0.26992488,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [-0.35062246, -0.18240867,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.07754287,  0.33889839,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 0.2616295 ,  0.64027463,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ],
       [ 1.02472886,  0.34047332,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 1.25233884,  0.64324204,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 2.14259107,  0.84118581,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 2.95645589,  2.21676944,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 4.11128168,  2.75781563,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.54025213, -0.6654458 ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.5198769 , -0.61001686,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.3333273 , -0.56271731,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.0304175 , -0.28623875,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.13550694, -0.15477596,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.54558041, -0.69654679,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.53566872, -0.61954045,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.56094543, -0.58026359,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.44776842, -0.36136894,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.42068118, -0.24865385,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ],
       [-0.62272901, -0.93372268,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.62148121, -0.91958445,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.62272357, -0.90784563,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.61735451, -0.8595581 ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.61374161, -0.82630211,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.59750395, -0.93911703,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.61461425, -0.92586868,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.62060585, -0.91733996,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.54318812, -0.84725987,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.54905466, -0.67399173,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.52958198, -0.59664114,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.53723458, -0.53909508,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.37602966, -0.16801571,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.30255648,  0.00646247,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [ 0.6594916 ,  0.78415268,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 1.00803993,  1.09443114,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 2.0912357 ,  2.78503525,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 2.36604988,  3.45230994,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.56637157, -0.66681073,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.56992738, -0.5843498 ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.56939184, -0.49305032,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.50867935, -0.30836695,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.47635899, -0.1712172 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.51537504, -0.06655233,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.29901427,  0.51129215,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.49549204, -0.21636004,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [-0.62745649, -0.96422641,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.62666541, -0.95661166,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.62619239, -0.95264422,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.62332436, -0.93445592,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.62088857, -0.92573443,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.62226686, -0.74980181,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.62229948, -0.70692756,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.62329174, -0.65981736,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.59928186, -0.46034169,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.59352405, -0.37808386,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ],
       [-0.59263238, -0.63832946,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.59709889, -0.58641184,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.60466994, -0.5445514 ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.5452705 , -0.29804986,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [-0.52020855, -0.16303961,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ],
       [ 0.20877895,  0.3931173 ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.72650559,  0.68595965,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 1.7891168 ,  1.72101584,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 2.19048034,  2.24345547,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ]])
# Regression target: the 'specific' column taken from df.
y
4       195580.0
6       434149.0
7       619201.0
10     1457872.0
11     2213991.0
         ...    
347    2482173.0
354     365437.0
355     391292.0
358    1017303.0
359     844647.0
Name: specific, Length: 118, dtype: float64
# Preview the first rows of the feature frame (target column already removed).
X.head()
province general year gdp fdi rnr rr i fr reg it
4 Anhui 32100.0 2000 2902.09 31847 0.0 0.0 0.000000 1601508 East China 1499110
6 Anhui 66529.0 2002 3519.72 38375 0.0 0.0 0.000000 1677840 East China 2404936
7 Anhui 52108.0 2003 3923.11 36720 0.0 0.0 0.000000 1896479 East China 2815820
10 Anhui 279052.0 2006 6112.50 139354 0.0 0.0 0.324324 3434548 East China 5167300
11 Anhui 178705.0 2007 7360.92 299892 0.0 0.0 0.324324 4468640 East China 7040099

Are there categories already encoded in the dataset?

# Columns with object dtype are the candidates for categorical encoding.
# NOTE(review): 'fr' is reported as object although its displayed values look
# numeric -- confirm whether it should be converted before modelling.
X.dtypes
province     object
general     float64
year          int64
gdp         float64
fdi           int64
rnr         float64
rr          float64
i           float64
fr           object
reg          object
it            int64
dtype: object

Are there missing values in the dataset?

# Side-by-side summary per feature: number of NaNs and the column dtype.
missing_values = pd.concat(
    {
        "na_cnt": X.isna().sum(),
        "dtypes": X.dtypes,
    },
    axis=1,
)
missing_values
na_cnt dtypes
province 0 object
general 0 float64
year 0 int64
gdp 0 float64
fdi 0 int64
rnr 0 float64
rr 0 float64
i 0 float64
fr 0 object
reg 0 object
it 0 int64

Split data into training and test set

from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

ColumnTransformer

# Recall the per-column NaN counts and dtypes computed earlier.
missing_values
na_cnt dtypes
province 0 object
general 0 float64
year 0 int64
gdp 0 float64
fdi 0 int64
rnr 0 float64
rr 0 float64
i 0 float64
fr 0 object
reg 0 object
it 0 int64
# Full (unsplit) feature frame, shown for reference when picking columns.
X
province general year gdp fdi rnr rr i fr reg it
4 Anhui 32100.0 2000 2902.09 31847 0.000000 0.000000 0.000000 1601508 East China 1499110
6 Anhui 66529.0 2002 3519.72 38375 0.000000 0.000000 0.000000 1677840 East China 2404936
7 Anhui 52108.0 2003 3923.11 36720 0.000000 0.000000 0.000000 1896479 East China 2815820
10 Anhui 279052.0 2006 6112.50 139354 0.000000 0.000000 0.324324 3434548 East China 5167300
11 Anhui 178705.0 2007 7360.92 299892 0.000000 0.000000 0.324324 4468640 East China 7040099
... ... ... ... ... ... ... ... ... ... ... ...
347 Yunnan 564400.0 2007 4772.52 39453 0.000000 0.000000 0.000000 4867146 Southwest China 6832541
354 Zhejiang 321686.0 2002 8003.67 307610 0.000000 0.000000 0.000000 4958329 East China 1962633
355 Zhejiang 260313.0 2003 9705.02 498055 1.214286 0.035714 0.035714 6217715 East China 2261631
358 Zhejiang 394795.0 2006 15718.47 888935 1.214286 0.035714 0.035714 11537149 East China 2553268
359 Zhejiang 0.0 2007 18753.73 1036576 0.047619 0.000000 0.000000 16494981 East China 2939778

118 rows × 11 columns

Numerical preprocessing

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Columns that are routed through the numerical pipeline.
numerical_features = ['general', 'gdp', 'fdi', 'i', 'rr']

# Impute with SimpleImputer's default strategy (the column mean), then
# standardize each column.
num_prep = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ]
)
num_prep
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])
SimpleImputer()
StandardScaler()

Running only on numerical features

# Fit the numerical pipeline on the training rows only, then transform them.
num_trans = num_prep.fit_transform(X_train[numerical_features])
num_trans
array([[ 0.84445696, -0.23824571,  0.05133332, -0.46886997,  1.84019633],
       [ 0.24766853,  2.07395803,  1.57444985, -0.46886997, -0.43195321],
       [-0.72214407, -0.0446055 , -0.43939868, -0.46886997, -0.43195321],
       [-0.48856958, -0.11992158, -0.34379901, -0.46886997, -0.43195321],
       [ 0.85337791, -0.87758205, -0.54348651, -0.46886997, -0.43195321],
       [-0.62992082, -0.83196322, -0.62382665, -0.46886997,  0.17004293],
       [-0.83650183,  3.23074054,  4.28312405,  1.06326271, -0.43195321],
       [ 1.76828095, -0.19541419, -0.41644878, -0.46886997, -0.43195321],
       [ 1.78635605,  0.89747804,  0.79717537, -0.46886997,  3.97033652],
       [-0.01481382, -0.57518987, -0.60442312,  3.47737968, -0.43195321],
       [-0.33025804, -0.71766019, -0.62552372, -0.46886997, -0.43195321],
       [-0.75781248, -0.70583032, -0.54596728, -0.46886997, -0.43195321],
       [ 0.18859962, -0.10301925, -0.50475274, -0.46886997,  1.78339259],
       [ 3.61705142,  1.91072514,  0.2322093 , -0.46886997, -0.43195321],
       [-0.52047074, -0.52639929, -0.53731279, -0.46886997, -0.43195321],
       [-0.35308192, -0.41382042, -0.15623907,  2.47473995, -0.43195321],
       [ 0.51483816, -0.85369871, -0.61664935, -0.46886997, -0.43195321],
       [ 2.42485843, -0.08843151,  0.16030939, -0.46886997,  1.84019633],
       [-0.63055638, -0.3205428 , -0.53076977, -0.46886997, -0.43195321],
       [-0.81570508, -0.2076344 , -0.22869714,  0.25314756, -0.43195321],
       [-0.06101258, -0.78797637, -0.31514053, -0.46886997, -0.43195321],
       [-0.022315  ,  0.78012485,  0.51382125, -0.46886997, -0.43195321],
       [-0.37712312, -0.55331904, -0.3258642 ,  2.14150109, -0.43195321],
       [ 2.33926246, -0.46826211, -0.59521328,  3.47737968, -0.43195321],
       [ 0.62612061, -0.10351944, -0.37014586, -0.46886997, -0.43195321],
       [-0.1455937 , -0.05181405, -0.01314966, -0.46886997,  3.97033652],
       [-0.69280823, -0.04283426, -0.47187415, -0.46886997, -0.43195321],
       [-0.15946076, -0.25170559, -0.5456459 , -0.46886997, -0.43195321],
       [-0.80637666, -0.98226192, -0.59981116, -0.46886997, -0.43195321],
       [-0.45208878, -0.48061962, -0.51288289, -0.46886997,  1.78339259],
       [ 0.4867555 , -0.58727866, -0.62254962,  0.24317887, -0.43195321],
       [-0.74721037, -0.26346288, -0.50770147, -0.46886997, -0.26580227],
       [ 0.71498921, -0.96701118, -0.62406064,  0.76534802, -0.43195321],
       [-0.6776111 , -0.6259812 , -0.48297275,  1.75272242, -0.43195321],
       [ 0.20388901,  0.55832718,  0.6523297 , -0.46886997,  3.97033652],
       [-0.68462536, -0.15766887,  0.45117911, -0.46886997,  1.22955614],
       [-0.68130147, -0.59197843, -0.52937716, -0.46886997, -0.43195321],
       [-0.1213603 ,  0.16498962, -0.23803102,  1.33242116, -0.43195321],
       [-0.68649617, -0.8274596 , -0.61330032, -0.46886997,  0.42010287],
       [-0.73244378, -0.41583293, -0.54606312, -0.46886997, -0.43195321],
       [-0.75418874, -0.67194329, -0.56752738, -0.46886997, -0.43195321],
       [-0.11822093, -0.18759169, -0.14469223,  2.47473995, -0.43195321],
       [-0.5606009 , -0.89159703, -0.62039587, -0.46886997, -0.43195321],
       [-0.70811556, -0.99767742, -0.6295606 ,  1.33242116, -0.43195321],
       [-0.44276036, -0.47392695, -0.57065935, -0.46886997, -0.43195321],
       [-0.61544895, -0.21965453, -0.37710046, -0.46886997, -0.43195321],
       [-0.57791999, -0.10080471, -0.18860741, -0.46886997,  0.18342062],
       [ 2.23213936, -0.34291185, -0.59568406, -0.46886997, -0.43195321],
       [ 2.06354861,  1.39098988, -0.11068883, -0.46886997, -0.43195321],
       [-0.79190991, -0.9574449 , -0.62376745,  0.24317887, -0.43195321],
       [ 0.26443406,  2.61416753,  3.08558822,  1.06326271, -0.43195321],
       [-0.01209987,  0.53594439,  0.23629128, -0.46886997, -0.43195321],
       [-0.80634591, -0.96716418, -0.61755426, -0.46886997, -0.43195321],
       [-0.16938374,  0.8696658 ,  0.77316551, -0.27051351, -0.23415447],
       [-0.04311944,  0.18446741, -0.14938031,  0.73199078, -0.43195321],
       [-0.68041476,  0.47595163,  1.08241827, -0.18405044, -0.43195321],
       [-0.76631569, -0.33855142, -0.36484886, -0.46886997,  1.06490206],
       [-0.71942755, -1.0021987 , -0.63005112, -0.46886997,  1.28493979],
       [ 2.56991029,  0.81760341,  0.29109646, -0.46886997,  1.99108126],
       [-0.74181833, -0.94662522, -0.62596349, -0.46886997, -0.43195321],
       [-0.39172056, -0.75420507, -0.61813217,  0.73199078, -0.43195321],
       [-0.74252565, -0.58032118, -0.59939112, -0.46886997, -0.43195321],
       [-0.52984786,  0.82098506,  1.31844616, -0.46886997, -0.43195321],
       [-0.75226924, -0.67038781, -0.54044193, -0.46886997, -0.43195321],
       [-0.51722886, -0.15861235,  0.53360817, -0.46886997, -0.43195321],
       [ 0.94307429,  1.63510561,  0.0502649 , -0.46886997,  2.33722904],
       [ 2.27493734,  0.10828435, -0.43806527,  0.81281795, -0.43195321],
       [-0.62666612, -0.76651942, -0.62548989, -0.46886997, -0.43195321],
       [-0.66600439, -0.34358661, -0.52269601, -0.46886997, -0.43195321],
       [ 2.40153737,  0.79677218,  0.14893452, -0.46886997,  0.18342062],
       [ 0.09391613,  1.61452938,  2.56423569, -0.46886997, -0.43195321],
       [-0.70296186, -0.26446128, -0.52736154, -0.46886997, -0.43195321],
       [ 0.44208927,  4.18123112,  3.45974963, -0.46886997, -0.43195321],
       [-0.78308892, -1.01087642, -0.63087146, -0.46886997,  0.89725427],
       [-0.54691067,  0.98156598,  0.70367308, -0.46886997, -0.43195321],
       [ 0.60991632, -0.09784872, -0.51965707, -0.46886997, -0.43195321],
       [-0.76052643, -0.96000271, -0.62467519, -0.46886997,  0.33726408],
       [ 1.93383484,  0.09531485, -0.29395536,  4.04373958, -0.43195321],
       [-0.76653353, -0.6801267 , -0.54957002, -0.46886997, -0.43195321],
       [-0.83650183,  2.64457877,  2.29128386, -0.46886997, -0.43195321],
       [-0.51706228,  1.07292328,  2.54926935, -0.46886997, -0.43195321],
       [-0.39173081,  1.0465606 ,  2.24162345, -0.46886997, -0.43195321],
       [ 0.54861269, -0.32386364, -0.44453781, -0.46886997, -0.43195321],
       [-0.41346552, -0.66397367, -0.62655268, -0.46886997, -0.43195321],
       [-0.66187067, -0.05643144,  0.1018028 ,  1.19732432, -0.43195321],
       [-0.61096157, -0.57797129, -0.5712147 , -0.46886997, -0.43195321],
       [ 0.34407676,  0.4601909 ,  0.05937608, -0.46886997,  0.18342062],
       [-0.46490255, -0.97695015, -0.62658651,  1.7874348 , -0.43195321]])
# One output column per entry of numerical_features.
num_trans.shape
(88, 5)
# Full (unsplit) feature frame again, to locate the categorical columns.
X
province general year gdp fdi rnr rr i fr reg it
4 Anhui 32100.0 2000 2902.09 31847 0.000000 0.000000 0.000000 1601508 East China 1499110
6 Anhui 66529.0 2002 3519.72 38375 0.000000 0.000000 0.000000 1677840 East China 2404936
7 Anhui 52108.0 2003 3923.11 36720 0.000000 0.000000 0.000000 1896479 East China 2815820
10 Anhui 279052.0 2006 6112.50 139354 0.000000 0.000000 0.324324 3434548 East China 5167300
11 Anhui 178705.0 2007 7360.92 299892 0.000000 0.000000 0.324324 4468640 East China 7040099
... ... ... ... ... ... ... ... ... ... ... ...
347 Yunnan 564400.0 2007 4772.52 39453 0.000000 0.000000 0.000000 4867146 Southwest China 6832541
354 Zhejiang 321686.0 2002 8003.67 307610 0.000000 0.000000 0.000000 4958329 East China 1962633
355 Zhejiang 260313.0 2003 9705.02 498055 1.214286 0.035714 0.035714 6217715 East China 2261631
358 Zhejiang 394795.0 2006 15718.47 888935 1.214286 0.035714 0.035714 11537149 East China 2553268
359 Zhejiang 0.0 2007 18753.73 1036576 0.047619 0.000000 0.000000 16494981 East China 2939778

118 rows × 11 columns

Categorical preprocessing

# Columns that are routed through the categorical pipeline.
categorical_features = ['province', 'reg']

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the new spelling first
# and fall back to the old one so this cell runs on either side of the rename.
try:
    _dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn: `sparse_output` is not a known keyword.
    _dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Replace missing categories with an explicit placeholder, then one-hot
# encode into a dense array. With handle_unknown='ignore', categories not
# seen during fit are encoded as all zeros instead of raising.
cat_prep = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='sk_missing')),
    ('ohe', _dense_ohe)
])
cat_prep
Pipeline(steps=[('imputer',
                 SimpleImputer(fill_value='sk_missing', strategy='constant')),
                ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))])
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)

Running only on the categorical features

# Fit the categorical pipeline on the training rows only, then transform them.
cat_trans = cat_prep.fit_transform(X_train[categorical_features])
cat_trans
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])
# One indicator column per distinct category found in the fitted columns.
cat_trans.shape
(88, 33)

ColumnTransformer!

# Send each column group through its own preprocessing pipeline. Outputs are
# concatenated in the order listed (numerical first, then categorical);
# columns not named in either list are dropped by ColumnTransformer's default.
ct = ColumnTransformer(
    transformers=[
        ('numerical', num_prep, numerical_features),
        ('categorical', cat_prep, categorical_features),
    ]
)
ct
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['general', 'gdp', 'fdi', 'i', 'rr']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='sk_missing',
                                                                strategy='constant')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
# Fit the combined transformer on the training set. The first five output
# columns correspond to the five entries of numerical_features.
X_trans = ct.fit_transform(X_train)
X_trans[:, :5]
array([[ 0.84445696, -0.23824571,  0.05133332, -0.46886997,  1.84019633],
       [ 0.24766853,  2.07395803,  1.57444985, -0.46886997, -0.43195321],
       [-0.72214407, -0.0446055 , -0.43939868, -0.46886997, -0.43195321],
       [-0.48856958, -0.11992158, -0.34379901, -0.46886997, -0.43195321],
       [ 0.85337791, -0.87758205, -0.54348651, -0.46886997, -0.43195321],
       [-0.62992082, -0.83196322, -0.62382665, -0.46886997,  0.17004293],
       [-0.83650183,  3.23074054,  4.28312405,  1.06326271, -0.43195321],
       [ 1.76828095, -0.19541419, -0.41644878, -0.46886997, -0.43195321],
       [ 1.78635605,  0.89747804,  0.79717537, -0.46886997,  3.97033652],
       [-0.01481382, -0.57518987, -0.60442312,  3.47737968, -0.43195321],
       [-0.33025804, -0.71766019, -0.62552372, -0.46886997, -0.43195321],
       [-0.75781248, -0.70583032, -0.54596728, -0.46886997, -0.43195321],
       [ 0.18859962, -0.10301925, -0.50475274, -0.46886997,  1.78339259],
       [ 3.61705142,  1.91072514,  0.2322093 , -0.46886997, -0.43195321],
       [-0.52047074, -0.52639929, -0.53731279, -0.46886997, -0.43195321],
       [-0.35308192, -0.41382042, -0.15623907,  2.47473995, -0.43195321],
       [ 0.51483816, -0.85369871, -0.61664935, -0.46886997, -0.43195321],
       [ 2.42485843, -0.08843151,  0.16030939, -0.46886997,  1.84019633],
       [-0.63055638, -0.3205428 , -0.53076977, -0.46886997, -0.43195321],
       [-0.81570508, -0.2076344 , -0.22869714,  0.25314756, -0.43195321],
       [-0.06101258, -0.78797637, -0.31514053, -0.46886997, -0.43195321],
       [-0.022315  ,  0.78012485,  0.51382125, -0.46886997, -0.43195321],
       [-0.37712312, -0.55331904, -0.3258642 ,  2.14150109, -0.43195321],
       [ 2.33926246, -0.46826211, -0.59521328,  3.47737968, -0.43195321],
       [ 0.62612061, -0.10351944, -0.37014586, -0.46886997, -0.43195321],
       [-0.1455937 , -0.05181405, -0.01314966, -0.46886997,  3.97033652],
       [-0.69280823, -0.04283426, -0.47187415, -0.46886997, -0.43195321],
       [-0.15946076, -0.25170559, -0.5456459 , -0.46886997, -0.43195321],
       [-0.80637666, -0.98226192, -0.59981116, -0.46886997, -0.43195321],
       [-0.45208878, -0.48061962, -0.51288289, -0.46886997,  1.78339259],
       [ 0.4867555 , -0.58727866, -0.62254962,  0.24317887, -0.43195321],
       [-0.74721037, -0.26346288, -0.50770147, -0.46886997, -0.26580227],
       [ 0.71498921, -0.96701118, -0.62406064,  0.76534802, -0.43195321],
       [-0.6776111 , -0.6259812 , -0.48297275,  1.75272242, -0.43195321],
       [ 0.20388901,  0.55832718,  0.6523297 , -0.46886997,  3.97033652],
       [-0.68462536, -0.15766887,  0.45117911, -0.46886997,  1.22955614],
       [-0.68130147, -0.59197843, -0.52937716, -0.46886997, -0.43195321],
       [-0.1213603 ,  0.16498962, -0.23803102,  1.33242116, -0.43195321],
       [-0.68649617, -0.8274596 , -0.61330032, -0.46886997,  0.42010287],
       [-0.73244378, -0.41583293, -0.54606312, -0.46886997, -0.43195321],
       [-0.75418874, -0.67194329, -0.56752738, -0.46886997, -0.43195321],
       [-0.11822093, -0.18759169, -0.14469223,  2.47473995, -0.43195321],
       [-0.5606009 , -0.89159703, -0.62039587, -0.46886997, -0.43195321],
       [-0.70811556, -0.99767742, -0.6295606 ,  1.33242116, -0.43195321],
       [-0.44276036, -0.47392695, -0.57065935, -0.46886997, -0.43195321],
       [-0.61544895, -0.21965453, -0.37710046, -0.46886997, -0.43195321],
       [-0.57791999, -0.10080471, -0.18860741, -0.46886997,  0.18342062],
       [ 2.23213936, -0.34291185, -0.59568406, -0.46886997, -0.43195321],
       [ 2.06354861,  1.39098988, -0.11068883, -0.46886997, -0.43195321],
       [-0.79190991, -0.9574449 , -0.62376745,  0.24317887, -0.43195321],
       [ 0.26443406,  2.61416753,  3.08558822,  1.06326271, -0.43195321],
       [-0.01209987,  0.53594439,  0.23629128, -0.46886997, -0.43195321],
       [-0.80634591, -0.96716418, -0.61755426, -0.46886997, -0.43195321],
       [-0.16938374,  0.8696658 ,  0.77316551, -0.27051351, -0.23415447],
       [-0.04311944,  0.18446741, -0.14938031,  0.73199078, -0.43195321],
       [-0.68041476,  0.47595163,  1.08241827, -0.18405044, -0.43195321],
       [-0.76631569, -0.33855142, -0.36484886, -0.46886997,  1.06490206],
       [-0.71942755, -1.0021987 , -0.63005112, -0.46886997,  1.28493979],
       [ 2.56991029,  0.81760341,  0.29109646, -0.46886997,  1.99108126],
       [-0.74181833, -0.94662522, -0.62596349, -0.46886997, -0.43195321],
       [-0.39172056, -0.75420507, -0.61813217,  0.73199078, -0.43195321],
       [-0.74252565, -0.58032118, -0.59939112, -0.46886997, -0.43195321],
       [-0.52984786,  0.82098506,  1.31844616, -0.46886997, -0.43195321],
       [-0.75226924, -0.67038781, -0.54044193, -0.46886997, -0.43195321],
       [-0.51722886, -0.15861235,  0.53360817, -0.46886997, -0.43195321],
       [ 0.94307429,  1.63510561,  0.0502649 , -0.46886997,  2.33722904],
       [ 2.27493734,  0.10828435, -0.43806527,  0.81281795, -0.43195321],
       [-0.62666612, -0.76651942, -0.62548989, -0.46886997, -0.43195321],
       [-0.66600439, -0.34358661, -0.52269601, -0.46886997, -0.43195321],
       [ 2.40153737,  0.79677218,  0.14893452, -0.46886997,  0.18342062],
       [ 0.09391613,  1.61452938,  2.56423569, -0.46886997, -0.43195321],
       [-0.70296186, -0.26446128, -0.52736154, -0.46886997, -0.43195321],
       [ 0.44208927,  4.18123112,  3.45974963, -0.46886997, -0.43195321],
       [-0.78308892, -1.01087642, -0.63087146, -0.46886997,  0.89725427],
       [-0.54691067,  0.98156598,  0.70367308, -0.46886997, -0.43195321],
       [ 0.60991632, -0.09784872, -0.51965707, -0.46886997, -0.43195321],
       [-0.76052643, -0.96000271, -0.62467519, -0.46886997,  0.33726408],
       [ 1.93383484,  0.09531485, -0.29395536,  4.04373958, -0.43195321],
       [-0.76653353, -0.6801267 , -0.54957002, -0.46886997, -0.43195321],
       [-0.83650183,  2.64457877,  2.29128386, -0.46886997, -0.43195321],
       [-0.51706228,  1.07292328,  2.54926935, -0.46886997, -0.43195321],
       [-0.39173081,  1.0465606 ,  2.24162345, -0.46886997, -0.43195321],
       [ 0.54861269, -0.32386364, -0.44453781, -0.46886997, -0.43195321],
       [-0.41346552, -0.66397367, -0.62655268, -0.46886997, -0.43195321],
       [-0.66187067, -0.05643144,  0.1018028 ,  1.19732432, -0.43195321],
       [-0.61096157, -0.57797129, -0.5712147 , -0.46886997, -0.43195321],
       [ 0.34407676,  0.4601909 ,  0.05937608, -0.46886997,  0.18342062],
       [-0.46490255, -0.97695015, -0.62658651,  1.7874348 , -0.43195321]])
# Remaining columns: the one-hot encoded categorical features.
X_trans[:, 5:]
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])
# 5 scaled numerical columns + 33 one-hot columns = 38 columns in total.
X_trans.shape
(88, 38)

Linear model

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Chain the existing ColumnTransformer (`ct`) with a linear regression so
# that preprocessing and the model are fitted and applied as a single unit.
lin_reg = Pipeline(
    steps=[
        ("preprocess", ct),
        ("lin_reg", LinearRegression()),
    ]
)
# Bare expression: renders the pipeline when run as a notebook cell.
lin_reg
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['general', 'gdp', 'fdi', 'i',
                                                   'rr']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='sk_missing',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['province', 'reg'])])),
                ('lin_reg', LinearRegression())])
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['general', 'gdp', 'fdi', 'i', 'rr']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='sk_missing',
                                                                strategy='constant')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
LinearRegression()
# Fit the preprocessing step and then the linear regression on
# X_train / y_train; fit() returns the fitted pipeline.
lin_reg.fit(X_train, y_train)
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['general', 'gdp', 'fdi', 'i',
                                                   'rr']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='sk_missing',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['province', 'reg'])])),
                ('lin_reg', LinearRegression())])
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['general', 'gdp', 'fdi', 'i', 'rr']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='sk_missing',
                                                                strategy='constant')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
LinearRegression()
# Pipeline.score delegates to the final estimator; for LinearRegression this
# is the coefficient of determination (R^2).
# NOTE(review): this is evaluated on the same X_train / y_train used for
# fitting, so it is an in-sample figure, not an estimate of generalization.
lin_reg.score(X_train, y_train)
0.9195619190476331

Random Forest

from sklearn.ensemble import RandomForestRegressor

# Same preprocessing (`ct`) as the linear pipeline, followed by a random
# forest regressor; random_state is fixed so repeated fits are reproducible.
# NOTE(review): the final step is named 'log_reg' although it wraps a
# RandomForestRegressor. The name is kept because it determines the parameter
# keys (log_reg__*) and any lookup of the step by name.
rf = Pipeline(
    steps=[
        ("preprocess", ct),
        ("log_reg", RandomForestRegressor(random_state=42)),
    ]
)
# Bare expression: renders the pipeline when run as a notebook cell.
rf
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['general', 'gdp', 'fdi', 'i',
                                                   'rr']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='sk_missing',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['province', 'reg'])])),
                ('log_reg', RandomForestRegressor(random_state=42))])
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['general', 'gdp', 'fdi', 'i', 'rr']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='sk_missing',
                                                                strategy='constant')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestRegressor(random_state=42)
# Fit the preprocessing step and then the random forest on
# X_train / y_train; fit() returns the fitted pipeline.
rf.fit(X_train, y_train)
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['general', 'gdp', 'fdi', 'i',
                                                   'rr']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='sk_missing',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['province', 'reg'])])),
                ('log_reg', RandomForestRegressor(random_state=42))])
ColumnTransformer(transformers=[('numerical',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 ['general', 'gdp', 'fdi', 'i', 'rr']),
                                ('categorical',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(fill_value='sk_missing',
                                                                strategy='constant')),
                                                 ('ohe',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False))]),
                                 ['province', 'reg'])])
['general', 'gdp', 'fdi', 'i', 'rr']
SimpleImputer()
StandardScaler()
['province', 'reg']
SimpleImputer(fill_value='sk_missing', strategy='constant')
OneHotEncoder(handle_unknown='ignore', sparse=False)
RandomForestRegressor(random_state=42)
# Pipeline.score delegates to the final estimator; for RandomForestRegressor
# this is the coefficient of determination (R^2).
# NOTE(review): this is evaluated on the same X_train / y_train used for
# fitting, so it is an in-sample figure, not an estimate of generalization.
rf.score(X_train, y_train)
0.9503718502648469