Vietnamese NLP Example using StackNetClassifier
Credit: Code and Notebooks from https://github.com/ngxbac/aivivn_phanloaisacthaibinhluan
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix, vstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import *
from sklearn.linear_model import *
from tqdm import *
import wordcloud
import matplotlib.pyplot as plt
import gc
import lightgbm as lgb
%matplotlib inline
# Load data
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
train_df.head()
test_df.head()
df = pd.concat([train_df, test_df], axis=0)
# del train_df, test_df
# gc.collect()
import emoji
def extract_emojis(s):  # renamed from `str` to avoid shadowing the builtin
    return [c for c in s if c in emoji.UNICODE_EMOJI]
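Note: emoji.UNICODE_EMOJI exists in older versions of the emoji package; it was removed in emoji >= 2.0. A minimal equivalent sketch, assuming a newer version of the package:
# Equivalent check for emoji >= 2.0, where UNICODE_EMOJI was removed:
def extract_emojis_v2(s):
    return [c for c in s if emoji.is_emoji(c)]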
good_df = train_df[train_df['label'] == 0]
good_comment = good_df['comment'].values
good_emoji = []
for c in good_comment:
    good_emoji += extract_emojis(c)
good_emoji = np.unique(np.asarray(good_emoji))
bad_df = train_df[train_df['label'] == 1]
bad_comment = bad_df['comment'].values
bad_emoji = []
for c in bad_comment:
    bad_emoji += extract_emojis(c)
bad_emoji = np.unique(np.asarray(bad_emoji))
good_emoji
# Manually remove the "sad/bad" emoji from the good list :D
good_emoji_fix = [
'↖', '↗', '☀', '☺', '♀', '♥', '✌', '✨', '❣', '❤', '⭐', '🆗',
'🌝', '🌟', '🌧', '🌷', '🌸', '🌺', '🌼', '🍓', '🎈', '🎉', '🐅', '🐾', '👉',
'👌', '👍', '👏', '💋', '💌', '💐', '💓', '💕', '💖', '💗', '💙', '💚', '💛',
'💜', '💞', '💟', '💥', '💪', '💮', '💯', '💰', '📑', '🖤', '😀', '😁', '😂',
'😃', '😄', '😅', '😆', '😇', '😉', '😊', '😋', '😌', '😍', '😎', '😑', '😓', '😔',
'😖', '😗', '😘', '😙', '😚', '😛', '😜', '😝', '😞', '😟', '😡', '😯', '😰', '😱', '😲', '😳', '😻', '🙂', '🙃', '🙄', '🙆', '🙌', '🤑', '🤔', '🤗',
]
bad_emoji
# Manually remove the "good" emoji from the bad list :D
bad_emoji_fix = [
'☹', '✋', '❌', '❓', '👎', '👶', '💀',
'😐', '😑', '😒', '😓', '😔',
'😞', '😟', '😠', '😡', '😢', '😣', '😤', '😥', '😧', '😩', '😪', '😫', '😬',
'😭', '😳', '😵', '😶', '🙁', '🙄', '🤔',
]
def count_good_bad_emoji(row):
    comment = row['comment']
    n_good_emoji = 0
    n_bad_emoji = 0
    for c in comment:
        if c in good_emoji_fix:
            n_good_emoji += 1
        if c in bad_emoji_fix:
            n_bad_emoji += 1
    row['n_good_emoji'] = n_good_emoji
    row['n_bad_emoji'] = n_bad_emoji
    return row
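The row-wise apply used below works but is slow on large frames. A minimal sketch of a faster variant using set lookups and a column map (same logic; helper name is hypothetical):
# Optional speed-up over df.apply(..., axis=1): set membership plus Series.map.
good_set, bad_set = set(good_emoji_fix), set(bad_emoji_fix)
def count_emoji(comment, emoji_set):
    return sum(c in emoji_set for c in comment)
# df['n_good_emoji'] = df['comment'].map(lambda s: count_emoji(s, good_set))
# df['n_bad_emoji'] = df['comment'].map(lambda s: count_emoji(s, bad_set))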
# Some features
df['comment'] = df['comment'].fillna(' ').astype(str)  # fillna before astype, else NaN becomes the literal string 'nan'
df['comment'] = df['comment'].str.lower()
df['num_words'] = df['comment'].apply(lambda s: len(s.split()))
df['num_unique_words'] = df['comment'].apply(lambda s: len(set(s.split())))
df['words_vs_unique'] = df['num_unique_words'] / df['num_words'] * 100
df = df.apply(count_good_bad_emoji, axis=1)
df['good_bad_emoji_ratio'] = df['n_good_emoji'] / df['n_bad_emoji']
df['good_bad_emoji_ratio'] = df['good_bad_emoji_ratio'].replace(np.nan, 0)
df['good_bad_emoji_ratio'] = df['good_bad_emoji_ratio'].replace(np.inf, 99)
df['good_bad_emoji_diff'] = df['n_good_emoji'] - df['n_bad_emoji']
df['good_bad_emoji_sum'] = df['n_good_emoji'] + df['n_bad_emoji']
train_df = df[~df['label'].isnull()]
test_df = df[df['label'].isnull()]
train_comments = train_df['comment'].fillna("none").values
test_comments = test_df['comment'].fillna("none").values
y_train = train_df['label'].values
train_df.head()
Create simple TF-IDF features
tfidf = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    max_features=10000,
    sublinear_tf=True
)
X_train_tfidf = tfidf.fit_transform(train_comments)
X_test_tfidf = tfidf.transform(test_comments)
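Word-level TF-IDF can miss tone-mark variants and teencode spellings in Vietnamese comments. A hedged sketch of a complementary character n-gram vectorizer (parameters are illustrative, not tuned):
# Illustrative character n-gram features; hstack them with the word-level matrix if they help.
tfidf_char = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 4),
    max_features=20000,
    sublinear_tf=True
)
# X_train_char = tfidf_char.fit_transform(train_comments)
# X_test_char = tfidf_char.transform(test_comments)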
EXCLUDED_COLS = ['id', 'comment', 'label']
static_cols = [c for c in train_df.columns if c not in EXCLUDED_COLS]
X_train_static = train_df[static_cols].values
X_test_static = test_df[static_cols].values
X_train = hstack([X_train_tfidf, csr_matrix(X_train_static)]).tocsr()
X_test = hstack([X_test_tfidf, csr_matrix(X_test_static)]).tocsr()
# X_train = X_train_tfidf
# X_test = X_test_tfidf
X_train.shape, X_test.shape, y_train.shape
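One caveat: the static count features are on a very different scale from the TF-IDF values, which mainly affects the linear model in the first level. A minimal, commented-out sketch (an optional tweak, not part of the original pipeline) that would slot in before the hstack above:
# Optional: rescale the static features before stacking; MaxAbsScaler keeps sparsity.
# from sklearn.preprocessing import MaxAbsScaler
# scaler = MaxAbsScaler()
# X_train_static = scaler.fit_transform(X_train_static)
# X_test_static = scaler.transform(X_test_static)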
models = [
    ######## First level ########
    [
        RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
        ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
        GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
        LogisticRegression(random_state=1)
    ],
    ######## Second level ########
    [
        RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=5, max_features=0.5, random_state=1)
    ]
]
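pystacknet is not distributed on PyPI, so the import below assumes it was installed from source (the repo location here is an assumption):
# Assumed install step, run once outside the notebook:
# pip install git+https://github.com/h2oai/pystacknet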
from pystacknet.pystacknet import StackNetClassifier
model = StackNetClassifier(
    models, metric="f1",
    folds=5,
    restacking=False,
    use_retraining=True,
    use_proba=True,
    random_state=12345, n_jobs=1, verbose=1
)
model.fit(X_train, y_train)
preds = model.predict_proba(X_test)
pred_cls = np.argmax(preds, axis=1)
# submission = pd.read_csv("./data/sample_submission.csv")
# submission['label'] = pred_cls
# submission.head()
# submission.to_csv("stack_demo.csv", index=False)
from sklearn.model_selection import cross_val_predict
models = [
    RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
    ExtraTreesClassifier(n_estimators=100, criterion="entropy", max_depth=5, max_features=0.5, random_state=1),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, max_features=0.5, random_state=1),
    LogisticRegression(random_state=1)
]
def cross_val_and_predict(clf, X, y, X_test, nfolds):
    kf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=42)
    oof_preds = np.zeros((X.shape[0], 2))
    sub_preds = np.zeros((X_test.shape[0], 2))
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
        X_trn, y_trn = X[train_idx], y[train_idx]  # renamed to avoid shadowing the global X_train/y_train
        X_val, y_val = X[valid_idx], y[valid_idx]
        clf.fit(X_trn, y_trn)
        oof_preds[valid_idx] = clf.predict_proba(X_val)
        sub_preds += clf.predict_proba(X_test) / kf.n_splits
    return oof_preds, sub_preds
sub_preds = []
for clf in models:
    oof_pred, sub_pred = cross_val_and_predict(clf, X_train, y_train, X_test, nfolds=5)
    oof_pred_cls = oof_pred.argmax(axis=1)
    oof_f1 = f1_score(y_pred=oof_pred_cls, y_true=y_train)
    print(clf.__class__.__name__)
    print(f"F1 CV: {oof_f1}")
    sub_preds.append(sub_pred)
sub_preds = np.asarray(sub_preds)
sub_preds = sub_preds.mean(axis=0)
sub_pred_cls = sub_preds.argmax(axis=1)
# submission_ensemble = submission.copy()
# submission_ensemble['label'] = sub_pred_cls
# submission_ensemble.to_csv("ensemble.csv", index=False)
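The loop above blends the models by simple averaging. As a point of comparison with StackNetClassifier, here is a minimal manual stacking sketch that instead fits a meta-learner on the stacked OOF probabilities (it reuses the models list and cross_val_and_predict from above):
# Minimal manual stacking sketch: OOF probabilities become meta-features.
oof_stack, test_stack = [], []
for clf in models:
    oof_p, sub_p = cross_val_and_predict(clf, X_train, y_train, X_test, nfolds=5)
    oof_stack.append(oof_p[:, 1:])   # keep only the positive-class probability
    test_stack.append(sub_p[:, 1:])
meta_X, meta_T = np.hstack(oof_stack), np.hstack(test_stack)
meta = LogisticRegression(random_state=1)
meta.fit(meta_X, y_train)
stack_pred_cls = meta.predict(meta_T)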
LightGBM baseline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import wordcloud
import matplotlib.pyplot as plt
import gc
import lightgbm as lgb
%matplotlib inline
# Load data
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
train_df.head()
test_df.head()
train_comments = train_df['comment'].fillna("none").values
test_comments = test_df['comment'].fillna("none").values
y_train = train_df['label'].values
# Wordcloud of training set
cloud = " ".join(train_comments)  # join the comments; str() on a large array would be truncated by numpy
plt.figure(figsize=(20, 10))
word_cloud = wordcloud.WordCloud(
    max_words=200, background_color="black",
    width=2000, height=1000, mode="RGB"
).generate(cloud)
plt.axis("off")
plt.imshow(word_cloud)
# Wordcloud of test set
cloud = " ".join(test_comments)
plt.figure(figsize=(20, 10))
word_cloud = wordcloud.WordCloud(
    max_words=100, background_color="black",
    width=2000, height=1000, mode="RGB"
).generate(cloud)
plt.axis("off")
plt.imshow(word_cloud)
tfidf = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    max_features=10000,
    sublinear_tf=True
)
X_train = tfidf.fit_transform(train_comments)
X_test = tfidf.transform(test_comments)
X_train.shape, X_test.shape, y_train.shape
def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.round(y_hat)  # f1_score expects class labels, not probabilities
    return 'f1', f1_score(y_true, y_hat), True
print("Starting LightGBM. Train shape: {}, test shape: {}".format(X_train.shape, X_test.shape))
# Cross validation model
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
# Create arrays and dataframes to store results
oof_preds = np.zeros(X_train.shape[0])
sub_preds = np.zeros(X_test.shape[0])
# k-fold
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
    print("Fold %s" % n_fold)
    train_x, train_y = X_train[train_idx], y_train[train_idx]
    valid_x, valid_y = X_train[valid_idx], y_train[valid_idx]
    # set data structure
    lgb_train = lgb.Dataset(train_x, label=train_y, free_raw_data=False)
    lgb_test = lgb.Dataset(valid_x, label=valid_y, free_raw_data=False)
    params = {
        'objective': 'binary',
        'learning_rate': 0.01,
        'num_leaves': 76,
        'feature_fraction': 0.64,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'boosting_type': 'gbdt',
    }
    reg = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        valid_names=['train', 'valid'],
        num_boost_round=10000,
        verbose_eval=100,
        early_stopping_rounds=100,
        feval=lgb_f1_score
    )
    oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
    sub_preds += reg.predict(X_test, num_iteration=reg.best_iteration) / folds.n_splits
    del reg, train_x, train_y, valid_x, valid_y
    gc.collect()
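Note: verbose_eval and early_stopping_rounds are keyword arguments in the LightGBM 2.x/3.x API this notebook was written against; in LightGBM >= 4 they moved to callbacks. A hedged equivalent for newer versions:
# For LightGBM >= 4 (assumption about the installed version):
# reg = lgb.train(
#     params, lgb_train,
#     valid_sets=[lgb_train, lgb_test], valid_names=['train', 'valid'],
#     num_boost_round=10000, feval=lgb_f1_score,
#     callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)],
# )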
threshold = 0.5
preds = (sub_preds > threshold).astype(np.uint8)
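A fixed 0.5 cutoff is arbitrary for an F1-scored task. A minimal sketch that tunes the threshold on the OOF predictions instead (a common post-processing step, not in the original notebook):
# Report OOF F1 at 0.5, then search for a better decision threshold on OOF predictions.
print("OOF F1 @0.5:", f1_score(y_train, (oof_preds > 0.5).astype(np.uint8)))
best_t, best_f1 = 0.5, 0.0
for t in np.arange(0.1, 0.9, 0.01):
    t_f1 = f1_score(y_train, (oof_preds > t).astype(np.uint8))
    if t_f1 > best_f1:
        best_t, best_f1 = t, t_f1
print(f"Best threshold: {best_t:.2f}, OOF F1: {best_f1:.4f}")
# preds = (sub_preds > best_t).astype(np.uint8)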