import csv

import gender_guesser.detector as gender
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_curve,
)
from sklearn.model_selection import (
    cross_val_score,
    learning_curve,
    train_test_split,
)
from sklearn.utils import shuffle

# Numeric encoding of the detector's verdict. Only the two unambiguous
# answers get their own code; every other verdict maps to the unknown code.
_SEX_CODES = {'female': 2, 'male': 1}
_UNKNOWN_SEX_CODE = 0

# Raw profile counters used as model inputs; 'sex_code' is appended after them.
_COUNT_COLUMNS = [
    'statuses_count',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
]


def read_datasets():
    """Read the genuine and fake user profiles from their CSV files.

    Returns:
        A pair ``(x, y)``. ``x`` is a DataFrame holding the rows of
        ``data/users.csv`` followed by the rows of ``data/fusers.csv``;
        the row labels of the two files are kept as-is, so index labels
        may repeat. ``y`` is a list of labels aligned with ``x``:
        1 for every genuine user, 0 for every fake user.
    """
    genuine_users = pd.read_csv("data/users.csv")
    fake_users = pd.read_csv("data/fusers.csv")

    x = pd.concat([genuine_users, fake_users])
    y = [1] * len(genuine_users) + [0] * len(fake_users)
    return x, y


def predict_sex(names):
    """Guess a sex code for each display name from its first name.

    Args:
        names: Iterable of display names. Entries that are not strings
            (for example missing values) are treated as unknown.

    Returns:
        A list with one code per input name: 2 for 'female', 1 for
        'male', and 0 for anything else (including 'mostly_female',
        'mostly_male' and names with no usable first token).
    """
    sex_predictor = gender.Detector(case_sensitive=False)
    sex_code = []
    for name in names:
        # split() without an argument ignores leading/repeated whitespace,
        # so "  Anna Smith" still yields "Anna" rather than an empty token.
        tokens = name.split() if isinstance(name, str) else []
        if not tokens:
            sex_code.append(_UNKNOWN_SEX_CODE)
            continue
        sex = sex_predictor.get_gender(tokens[0])
        sex_code.append(_SEX_CODES.get(sex, _UNKNOWN_SEX_CODE))
    return sex_code


def extract_features(x):
    """Build the model's feature matrix from the raw profile table.

    Args:
        x: DataFrame containing the count columns 'statuses_count',
            'followers_count', 'friends_count', 'favourites_count',
            'listed_count' and a 'name' column.

    Returns:
        A new DataFrame with the five count columns followed by
        'sex_code' (see ``predict_sex``). The input frame is left
        unmodified.
    """
    # Copy the selected columns so adding 'sex_code' never writes into
    # the caller's frame.
    features = x[_COUNT_COLUMNS].copy()
    features['sex_code'] = predict_sex(x['name'])
    return features
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against training-set size.

    Args:
        estimator: Estimator passed through to ``learning_curve``.
        title: Title of the figure.
        X: Training features.
        y: Training labels.
        ylim: Optional ``(ymin, ymax)`` pair applied to the y axis.
        cv: Cross-validation setting passed through to ``learning_curve``.
        n_jobs: Number of parallel jobs passed through to ``learning_curve``.
        train_sizes: Training-set sizes passed through to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain ``.show()``.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    # Aggregate over axis 1 to get one mean/std per training size.
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.grid()
    # Shaded bands show +/- one standard deviation around each mean curve.
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt


def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues,
                          target_names=('Fake', 'Genuine')):
    """Draw a confusion matrix as a colour-mapped image in a new figure.

    Args:
        cm: 2-D confusion matrix (rows are labelled as true classes,
            columns as predicted classes).
        title: Title of the figure.
        cmap: Matplotlib colormap used for the cells.
        target_names: Tick labels for the classes, in matrix order.
    """
    target_names = list(target_names)

    # Start a fresh figure so successive calls do not draw over each other.
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def plot_roc_curve(y_test, y_pred):
    """Print the ROC points, then plot the ROC curve and show all figures.

    Args:
        y_test: True binary labels.
        y_pred: Predicted scores or labels, forwarded to ``roc_curve``.
            When hard class labels are passed the curve has only one
            point between (0, 0) and (1, 1).
    """
    false_positive_rate, true_positive_rate, thresholds = roc_curve(
        y_test, y_pred)

    print("False Positive rate: ", false_positive_rate)
    print("True Positive rate: ", true_positive_rate)

    roc_auc = auc(false_positive_rate, true_positive_rate)

    # Start a fresh figure so the curve is not drawn over an earlier plot.
    plt.figure()
    plt.title('Receiver Operating Characteristic')
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def train(X_train, y_train, X_test, y_true=None):
    """Train a Random Forest, report CV scores, and predict the test set.

    Side effects: prints the classifier and its cross-validation scores,
    shows a learning-curve figure, and pickles the fitted classifier to
    ``data.pkl`` in the current working directory.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.
        y_true: Optional true labels for ``X_test``; returned unchanged
            alongside the predictions. When omitted, the module-level
            ``y_test`` is returned instead (the historical behaviour).

    Returns:
        A pair ``(y_true, y_pred)`` where ``y_pred`` holds the predicted
        class for each row of ``X_test``.
    """
    import pickle

    clf = RandomForestClassifier(n_estimators=40, oob_score=True)
    clf.fit(X_train, y_train)
    print("The best classifier is: ", clf)

    # Estimate score
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(scores)
    # NOTE(review): the band printed here is std / 2; the conventional
    # "+/-" band is 2 * std -- confirm which was intended.
    print('Estimated score: %0.5f (+/- %0.5f)' % (scores.mean(), scores.std() / 2))

    title = 'Learning Curves (Random Forest)'
    plot_learning_curve(clf, title, X_train, y_train, cv=5)
    plt.show()

    # Predict
    y_pred = clf.predict(X_test)

    # Persist the fitted classifier for later reuse.
    with open('data.pkl', 'wb') as model_file:
        pickle.dump(clf, model_file)

    if y_true is None:
        # Backward compatibility: earlier callers passed only three
        # arguments and relied on the module-level y_test.
        y_true = y_test
    return y_true, y_pred


print("Reading datasets...\n")
x, y = read_datasets()

print("Extracting features...\n")
x = extract_features(x)
print(x.columns)
print(x.describe())

print("Splitting datasets into train and test dataset...\n")
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=44)

print("Training datasets...\n")
# Pass the held-out labels explicitly instead of relying on the global.
y_test, y_pred = train(X_train, y_train, X_test, y_test)

print('Classification Accuracy on Test dataset: ', accuracy_score(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix, without normalization')
print(cm)
plot_confusion_matrix(cm)

# Divide each row by its total so every row sums to 1.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

print(classification_report(y_test, y_pred, target_names=['Fake', 'Genuine']))

# y_pred holds hard class predictions here, not probabilities.
plot_roc_curve(y_test, y_pred)