from
sklearn.model_selection
import
train_test_split
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.naive_bayes
import
MultinomialNB
from
sklearn.metrics
import
confusion_matrix, classification_report
import
numpy as np
import
nltk
from
nltk.corpus
import
stopwords
from
nltk.stem
import
WordNetLemmatizer
import
csv
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Only the tag's first letter matters ('J'/'V'/'N'/'R'); any other
    tag — including an empty one — falls back to NOUN.
    """
    pos_by_initial = {
        'J': nltk.corpus.wordnet.ADJ,
        'V': nltk.corpus.wordnet.VERB,
        'N': nltk.corpus.wordnet.NOUN,
        'R': nltk.corpus.wordnet.ADV,
    }
    # treebank_tag[:1] is '' for an empty tag, which misses the table
    # and yields NOUN — same result as the original startswith chain.
    return pos_by_initial.get(treebank_tag[:1], nltk.corpus.wordnet.NOUN)
def preprocessing(text):
    """Tokenize, clean, and lemmatize *text*; return a space-joined string.

    Pipeline: sentence + word tokenize, lowercase, drop English stopwords
    and tokens shorter than 3 characters, then POS-aware lemmatization.
    """
    tokens = [word
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Lowercase BEFORE the stopword filter: the original filtered first,
    # so capitalized stopwords ("The", "I", ...) survived into the output.
    tokens = [token.lower() for token in tokens]
    # A set gives O(1) membership; the original scanned the stopword
    # list once per token.
    stops = set(stopwords.words('english'))
    tokens = [token for token in tokens
              if token not in stops and len(token) >= 3]
    lmtzr = WordNetLemmatizer()
    tags = nltk.pos_tag(tokens)
    # pos_tag returns (word, tag) pairs aligned with tokens.
    tokens = [lmtzr.lemmatize(token, pos=get_wordnet_pos(tag))
              for token, (_, tag) in zip(tokens, tags)]
    return ' '.join(tokens)
def read_dataset():
    """Load the tab-separated SMSSpamCollection corpus.

    Returns:
        (sms_data, sms_label): preprocessed message texts and their
        corresponding labels (column 0 of each row), in file order.
    """
    file_path = r'SMSSpamCollection'
    sms_label = []
    sms_data = []
    # Context manager closes the file even if preprocessing() raises
    # mid-iteration; the original leaked the handle on error.
    with open(file_path, encoding='utf-8') as sms:
        for line in csv.reader(sms, delimiter='\t'):
            sms_label.append(line[0])
            sms_data.append(preprocessing(line[1]))
    return sms_data, sms_label
def split_dataset(data, label):
    """Stratified 80/20 train/test split with a fixed seed for reproducibility."""
    train_x, test_x, train_y, test_y = train_test_split(
        data,
        label,
        test_size=0.2,
        random_state=0,
        stratify=label,
    )
    return train_x, test_x, train_y, test_y
def tfidf_dataset(x_train, x_test):
    """Fit TF-IDF on the training texts; transform both splits.

    Returns the two sparse matrices plus the fitted vectorizer so the
    caller can map feature indices back to words.
    """
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(x_train)
    # transform only: the test set must not influence the vocabulary/IDF.
    test_matrix = vectorizer.transform(x_test)
    return train_matrix, test_matrix, vectorizer
def revert_mail(x_train, X_train, model):
    """Show how the first training mail maps to its TF-IDF vector.

    Args:
        x_train: list of preprocessed mail strings.
        X_train: sparse TF-IDF matrix for x_train.
        model: fitted TfidfVectorizer (only .vocabulary_ is used).

    Prints the dense vector, its nonzero positions and values, the
    vocabulary words at those positions, and the original text.
    """
    # Densify only row 0; the original called X_train.toarray() twice,
    # materializing the whole matrix each time.
    s = X_train[0].toarray()[0]
    print("第一封邮件向量表示为:", s)
    a = np.flatnonzero(s)
    print("非零元素的位置:", a)
    print("向量的非零元素的值:", s[a])
    # set membership is O(1); `idx in a` on an ndarray scans it per word.
    nonzero_positions = set(a.tolist())
    key_list = [word for word, idx in model.vocabulary_.items()
                if idx in nonzero_positions]
    print("向量非零元素对应的单词:", key_list)
    print("向量化之前的邮件:", x_train[0])
def mnb_model(x_train, x_test, y_train, y_test):
    """Train a multinomial naive Bayes classifier and print raw hit counts.

    Returns the predicted labels for x_test.
    """
    classifier = MultinomialNB()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print("总数:", len(y_test))
    print("预测正确数:", (predictions == y_test).sum())
    return predictions
def class_report(ypre_mnb, y_test):
    """Print the confusion matrix, classification report, and accuracy."""
    conf_matrix = confusion_matrix(y_test, ypre_mnb)
    print("混淆矩阵:\n", conf_matrix)
    c = classification_report(y_test, ypre_mnb)
    print("------------------------------------------")
    print("分类报告:\n", c)
    # trace()/sum() equals the original (cm[0][0]+cm[1][1])/sum for the
    # 2-class ham/spam case, and stays correct if more classes appear.
    print("模型准确率:", conf_matrix.trace() / conf_matrix.sum())
if __name__ == '__main__':
    # Pipeline: load -> split -> vectorize -> inspect one mail -> train -> report.
    texts, labels = read_dataset()
    train_texts, test_texts, train_labels, test_labels = split_dataset(texts, labels)
    train_vecs, test_vecs, vectorizer = tfidf_dataset(train_texts, test_texts)
    revert_mail(train_texts, train_vecs, vectorizer)
    predictions = mnb_model(train_vecs, test_vecs, train_labels, test_labels)
    class_report(predictions, test_labels)