After training a model on chunks, how can I save the final model?
# Minimal illustration of the problem: fit the classifier on every chunk of
# the CSV, then persist it with joblib.
# The input path was elided in the original snippet; "ExtractedData.csv" is
# the file read by the full script further down.
df = pd.read_csv("ExtractedData.csv", chunksize=10000)

for chunk in df:
    # X_train / y_train are presumably derived from `chunk` (omitted here).
    # NOTE(review): text_clf looks like a scikit-learn estimator; each fit()
    # call then starts from scratch, so what was learned from the previous
    # chunk is replaced rather than extended.
    text_clf.fit(X_train, y_train)

# save the model to disk
# At this point text_clf is whatever the last loop iteration produced.
filename = 'finalized_model.sav'
joblib.dump(text_clf, filename)

# load the model from disk
loaded_model = joblib.load(filename)
Saving the model like this only gives me the model that was trained on the last chunk. How can I avoid that and save a single model that has been trained on every chunk?
Update: sharing the full code, as requested.
# Full script: read the CSV in chunks, clean and filter each chunk, train and
# evaluate a text-classification pipeline on it, then persist the pipeline.
df = pd.read_csv("ExtractedData.csv", chunksize=6953)

for chunk in df:
    # Drop incomplete rows, then treat every column as text.
    chunk = chunk.dropna()
    chunk = chunk.astype(str)

    text = chunk['body']
    label = chunk['user_id']

    # remove url
    text = remove_url(text)

    # Keep only documents with more than 50 and fewer than 2000 words.
    # The word count is computed once and reused for both bounds.
    word_counts = text.str.split().str.len()
    mask_words = (word_counts > 50) & (word_counts < 2000)
    text = text.loc[mask_words]
    label = label.loc[mask_words]

    # Keep only documents with more than 5 and fewer than 18000 characters.
    char_counts = text.str.len()
    mask_chars = (char_counts > 5) & (char_counts < 18000)
    text = text.loc[mask_chars]
    label = label.loc[mask_chars]

    # 70/30 split of this chunk; the fixed seed makes the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        text, label, test_size=0.3, shuffle=True, random_state=42)

    # A brand-new pipeline is built and fitted for every chunk, so nothing
    # learned from earlier chunks is carried over into this one.
    # NOTE(review): TfidfVectorizer already emits tf-idf features, so the
    # following TfidfTransformer re-weights them a second time -- confirm
    # this is intended.
    # NOTE(review): if lemmatize_text is a callable, scikit-learn ignores
    # strip_accents, lowercase and ngram_range -- confirm against its docs.
    text_clf = Pipeline([
        ('vect', TfidfVectorizer(strip_accents='unicode', lowercase=True,
                                 analyzer=lemmatize_text, ngram_range=(1, 3))),
        ('tfidf', TfidfTransformer()),
        ('clf', XGBClassifier()),
    ])
    text_clf.fit(X_train, y_train)

    predicted = text_clf.predict(X_test)
    print(metrics.classification_report(y_test, predicted))

    # calculate accuracy (and weighted precision / recall / F1) for this chunk
    accuracy_list.append(metrics.accuracy_score(y_test, predicted))
    precision_list.append(metrics.precision_score(y_test, predicted, average='weighted'))
    recall_list.append(metrics.recall_score(y_test, predicted, average='weighted'))
    f1_list.append(metrics.f1_score(y_test, predicted, average='weighted'))

# save the model to disk
# text_clf here is the pipeline fitted on the last chunk only.
filename = 'finalized_model.sav'
joblib.dump(text_clf, filename)

# load the model from disk
loaded_model = joblib.load(filename)