In [1]:
import pandas as pd
In [5]:
data = pd.read_csv('./data/shill_bidding.csv', encoding='gbk')  # load the Shill Bidding dataset (GBK-encoded CSV)
data
Out[5]:
In [9]:
X = data.iloc[:, :-1]  # feature columns (everything except the last column)
X
Out[9]:
In [10]:
y = data.iloc[:, -1]  # label column (the last column)
y
Out[10]:
In [11]:
from sklearn.model_selection import train_test_split
In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [14]:
print("训练集特征数量:", len(X_train))
print("测试集特征数量:", len(X_test))
print("训练集标签数量:", len(y_train))
print("测试集标签数量:", len(y_test))
In [16]:
from sklearn.decomposition import PCA
In [17]:
pca = PCA(n_components=0.999)  # keep enough components to explain 99.9% of the variance
In [18]:
X_train_pca = pca.fit_transform(X_train)
In [19]:
X_test_pca = pca.transform(X_test)
In [20]:
print("降维后训练集大小:", X_train_pca.shape)
print("降维后测试集大小:", X_test_pca.shape)
In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
In [23]:
model = LogisticRegression()
model.fit(X_train_pca, y_train)
Out[23]:
In [24]:
y_pred = model.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
In [25]:
print(f"模型在测试集上的准确率: {accuracy * 100:.2f}%")
In [26]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
In [27]:
precision = precision_score(y_test, y_pred)
In [28]:
recall = recall_score(y_test, y_pred)
In [29]:
f1 = f1_score(y_test, y_pred)
In [30]:
print(f"精确率: {precision * 100:.2f}%")
print(f"召回率: {recall * 100:.2f}%")
print(f"F1 值: {f1 * 100:.2f}%")
In [31]:
cm = confusion_matrix(y_test, y_pred)
In [33]:
from matplotlib import font_manager as fm
import matplotlib as mpl
# Register a CJK-capable system font (macOS-specific path) so non-ASCII text renders in plots
font_path = '/System/Library/Fonts/STHeiti Medium.ttc'
my_font = fm.FontProperties(fname=font_path)
mpl.rcParams['font.family'] = my_font.get_name()
mpl.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with this font
In [34]:
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# annot=True writes the cell counts on the heatmap; fmt='d' formats them as integers
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix')
plt.show()
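Raw counts are hard to compare when one class dominates; a row-normalized matrix shows per-class recall directly. A small sketch reusing the same plotting setup (assumes scikit-learn >= 0.22, where the normalize argument of confusion_matrix was added):

# Row-normalized confusion matrix: each row sums to 1, so the diagonal is per-class recall
cm_norm = confusion_matrix(y_test, y_pred, normalize='true')
plt.figure(figsize=(8, 6))
sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Normalized confusion matrix')
plt.show()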
In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
In [45]:
scaler = StandardScaler()
X_train_pca_scaled = scaler.fit_transform(X_train_pca)
X_test_pca_scaled = scaler.transform(X_test_pca)
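Here standardization is applied to the PCA scores. A common alternative is to scale the raw features before PCA, so that no single feature dominates the components, and to chain all steps in a Pipeline so that cross-validation refits the whole preprocessing on each fold. A sketch of that variant (not the workflow used above):

from sklearn.pipeline import Pipeline

# Alternative: scale -> PCA -> logistic regression as one estimator
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=0.999)),
    ('clf', LogisticRegression(solver='liblinear')),
])
pipe.fit(X_train, y_train)
print("Pipeline test accuracy:", pipe.score(X_test, y_test))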
In [46]:
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
In [47]:
model = LogisticRegression(solver='liblinear')  # liblinear supports both the l1 and l2 penalties in the grid
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
grid_search.fit(X_train_pca_scaled, y_train)
Out[47]:
In [48]:
print("最优超参数组合:", grid_search.best_params_)
print("最优模型在训练集上的 F1 值:", grid_search.best_score_)
In [51]:
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_pca_scaled)
In [52]:
precision_best = precision_score(y_test, y_pred_best)
recall_best = recall_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best)
accuracy_best = accuracy_score(y_test, y_pred_best)
In [53]:
print(f"最优模型在测试集上的准确率: {accuracy_best * 100:.2f}%")
print(f"最优模型在测试集上的精确率: {precision_best * 100:.2f}%")
print(f"最优模型在测试集上的召回率: {recall_best * 100:.2f}%")
print(f"最优模型在测试集上的 F1 值: {f1_best * 100:.2f}%")
In [ ]:
import numpy as np
from sklearn.metrics import classification_report
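The final cell only imports numpy and classification_report and was left unfinished; a plausible continuation, assuming it was meant to summarize the tuned model's predictions per class, is:

# Per-class precision, recall, and F1 for the tuned model
print(classification_report(y_test, y_pred_best))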