Project2
导入库¶
In [16]:
Copied!
import os
import pandas as pd
import numpy as np
import datetime
import random
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB
from time import time
import warnings
import seaborn as sns
warnings.filterwarnings("ignore")
import os
import pandas as pd
import numpy as np
import datetime
import random
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,ComplementNB,BernoulliNB
from time import time
import warnings
import seaborn as sns
warnings.filterwarnings("ignore")
导入下载的20_newsgroups¶
但由于是文本,先把它处理成.csv
In [3]:
Copied!
path = '20_newsgroups/'
file_names = os.listdir(path)
print(file_names)
len(file_names)
path = '20_newsgroups/'
file_names = os.listdir(path)
print(file_names)
len(file_names)
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
Out[3]:
20
In [4]:
Copied!
target=[]
data=[]
for file_name in file_names:
path_ = '20_newsgroups/{}'.format(file_name)
file_names_ = os.listdir(path_)
for file in file_names_:
if file !='.ipynb_checkpoints':
path__='20_newsgroups/{}/{}'.format(file_name,file)
text = open(path__, encoding="utf8", errors='ignore')
text=text.read()
data.append(text)
target.append(file_name)
target=[]
data=[]
for file_name in file_names:
path_ = '20_newsgroups/{}'.format(file_name)
file_names_ = os.listdir(path_)
for file in file_names_:
if file !='.ipynb_checkpoints':
path__='20_newsgroups/{}/{}'.format(file_name,file)
text = open(path__, encoding="utf8", errors='ignore')
text=text.read()
data.append(text)
target.append(file_name)
In [5]:
Copied!
df = pd.DataFrame({'data': data,'target': target}, columns=['data', 'target'])
df.to_csv('./20_newsgroups.csv')
df = pd.DataFrame({'data': data,'target': target}, columns=['data', 'target'])
df.to_csv('./20_newsgroups.csv')
探索文本数据¶
In [6]:
Copied!
df=pd.read_csv('20_newsgroups.csv')
df
df=pd.read_csv('20_newsgroups.csv')
df
Out[6]:
| Unnamed: 0 | data | target | |
|---|---|---|---|
| 0 | 0 | Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49... | alt.atheism |
| 1 | 1 | Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51... | alt.atheism |
| 2 | 2 | Newsgroups: alt.atheism\nPath: cantaloupe.srv.... | alt.atheism |
| 3 | 3 | Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51... | alt.atheism |
| 4 | 4 | Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:51... | alt.atheism |
| ... | ... | ... | ... |
| 19992 | 19992 | Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54... | talk.religion.misc |
| 19993 | 19993 | Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:54... | talk.religion.misc |
| 19994 | 19994 | Xref: cantaloupe.srv.cs.cmu.edu talk.religion.... | talk.religion.misc |
| 19995 | 19995 | Xref: cantaloupe.srv.cs.cmu.edu talk.religion.... | talk.religion.misc |
| 19996 | 19996 | Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:... | talk.religion.misc |
19997 rows × 3 columns
In [7]:
Copied!
df['data'][0]
df['data'][0]
Out[7]:
'Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:49960 alt.atheism.moderated:713 news.answers:7054 alt.answers:126\nPath: cantaloupe.srv.cs.cmu.edu!crabapple.srv.cs.cmu.edu!bb3.andrew.cmu.edu!news.sei.cmu.edu!cis.ohio-state.edu!magnus.acs.ohio-state.edu!usenet.ins.cwru.edu!agate!spool.mu.edu!uunet!pipex!ibmpcug!mantis!mathew\nFrom: mathew <mathew@mantis.co.uk>\nNewsgroups: alt.atheism,alt.atheism.moderated,news.answers,alt.answers\nSubject: Alt.Atheism FAQ: Atheist Resources\nSummary: Books, addresses, music -- anything related to atheism\nKeywords: FAQ, atheism, books, music, fiction, addresses, contacts\nMessage-ID: <19930329115719@mantis.co.uk>\nDate: Mon, 29 Mar 1993 11:57:19 GMT\nExpires: Thu, 29 Apr 1993 11:57:19 GMT\nFollowup-To: alt.atheism\nDistribution: world\nOrganization: Mantis Consultants, Cambridge. UK.\nApproved: news-answers-request@mit.edu\nSupersedes: <19930301143317@mantis.co.uk>\nLines: 290\n\nArchive-name: atheism/resources\nAlt-atheism-archive-name: resources\nLast-modified: 11 December 1992\nVersion: 1.0\n\n Atheist Resources\n\n Addresses of Atheist Organizations\n\n USA\n\nFREEDOM FROM RELIGION FOUNDATION\n\nDarwin fish bumper stickers and assorted other atheist paraphernalia are\navailable from the Freedom From Religion Foundation in the US.\n\nWrite to: FFRF, P.O. Box 750, Madison, WI 53701.\nTelephone: (608) 256-8900\n\nEVOLUTION DESIGNS\n\nEvolution Designs sell the "Darwin fish". It\'s a fish symbol, like the ones\nChristians stick on their cars, but with feet and the word "Darwin" written\ninside. The deluxe moulded 3D plastic fish is $4.95 postpaid in the US.\n\nWrite to: Evolution Designs, 7119 Laurel Canyon #4, North Hollywood,\n CA 91605.\n\nPeople in the San Francisco Bay area can get Darwin Fish from Lynn Gold --\ntry mailing <figmo@netcom.com>. For net people who go to Lynn directly, the\nprice is $4.95 per fish.\n\nAMERICAN ATHEIST PRESS\n\nAAP publish various atheist books -- critiques of the Bible, lists of\nBiblical contradictions, and so on. One such book is:\n\n"The Bible Handbook" by W.P. Ball and G.W. Foote. American Atheist Press.\n372 pp. ISBN 0-910309-26-4, 2nd edition, 1986. Bible contradictions,\nabsurdities, atrocities, immoralities... contains Ball, Foote: "The Bible\nContradicts Itself", AAP. Based on the King James version of the Bible.\n\nWrite to: American Atheist Press, P.O. Box 140195, Austin, TX 78714-0195.\n or: 7215 Cameron Road, Austin, TX 78752-2973.\nTelephone: (512) 458-1244\nFax: (512) 467-9525\n\nPROMETHEUS BOOKS\n\nSell books including Haught\'s "Holy Horrors" (see below).\n\nWrite to: 700 East Amherst Street, Buffalo, New York 14215.\nTelephone: (716) 837-2475.\n\nAn alternate address (which may be newer or older) is:\nPrometheus Books, 59 Glenn Drive, Buffalo, NY 14228-2197.\n\nAFRICAN-AMERICANS FOR HUMANISM\n\nAn organization promoting black secular humanism and uncovering the history of\nblack freethought. They publish a quarterly newsletter, AAH EXAMINER.\n\nWrite to: Norm R. Allen, Jr., African Americans for Humanism, P.O. Box 664,\n Buffalo, NY 14226.\n\n United Kingdom\n\nRationalist Press Association National Secular Society\n88 Islington High Street 702 Holloway Road\nLondon N1 8EW London N19 3NL\n071 226 7251 071 272 1266\n\nBritish Humanist Association South Place Ethical Society\n14 Lamb\'s Conduit Passage Conway Hall\nLondon WC1R 4RH Red Lion Square\n071 430 0908 London WC1R 4RL\nfax 071 430 1271 071 831 7723\n\nThe National Secular Society publish "The Freethinker", a monthly magazine\nfounded in 1881.\n\n Germany\n\nIBKA e.V.\nInternationaler Bund der Konfessionslosen und Atheisten\nPostfach 880, D-1000 Berlin 41. Germany.\n\nIBKA publish a journal:\nMIZ. (Materialien und Informationen zur Zeit. Politisches\nJournal der Konfessionslosesn und Atheisten. Hrsg. IBKA e.V.)\nMIZ-Vertrieb, Postfach 880, D-1000 Berlin 41. Germany.\n\nFor atheist books, write to:\n\nIBDK, Internationaler B"ucherdienst der Konfessionslosen\nPostfach 3005, D-3000 Hannover 1. Germany.\nTelephone: 0511/211216\n\n\n Books -- Fiction\n\nTHOMAS M. DISCH\n\n"The Santa Claus Compromise"\nShort story. The ultimate proof that Santa exists. All characters and \nevents are fictitious. Any similarity to living or dead gods -- uh, well...\n\nWALTER M. MILLER, JR\n\n"A Canticle for Leibowitz"\nOne gem in this post atomic doomsday novel is the monks who spent their lives\ncopying blueprints from "Saint Leibowitz", filling the sheets of paper with\nink and leaving white lines and letters.\n\nEDGAR PANGBORN\n\n"Davy"\nPost atomic doomsday novel set in clerical states. The church, for example,\nforbids that anyone "produce, describe or use any substance containing...\natoms". \n\nPHILIP K. DICK\n\nPhilip K. Dick Dick wrote many philosophical and thought-provoking short \nstories and novels. His stories are bizarre at times, but very approachable.\nHe wrote mainly SF, but he wrote about people, truth and religion rather than\ntechnology. Although he often believed that he had met some sort of God, he\nremained sceptical. Amongst his novels, the following are of some relevance:\n\n"Galactic Pot-Healer"\nA fallible alien deity summons a group of Earth craftsmen and women to a\nremote planet to raise a giant cathedral from beneath the oceans. When the\ndeity begins to demand faith from the earthers, pot-healer Joe Fernwright is\nunable to comply. A polished, ironic and amusing novel.\n\n"A Maze of Death"\nNoteworthy for its description of a technology-based religion.\n\n"VALIS"\nThe schizophrenic hero searches for the hidden mysteries of Gnostic\nChristianity after reality is fired into his brain by a pink laser beam of\nunknown but possibly divine origin. He is accompanied by his dogmatic and\ndismissively atheist friend and assorted other odd characters.\n\n"The Divine Invasion"\nGod invades Earth by making a young woman pregnant as she returns from\nanother star system. Unfortunately she is terminally ill, and must be\nassisted by a dead man whose brain is wired to 24-hour easy listening music.\n\nMARGARET ATWOOD\n\n"The Handmaid\'s Tale"\nA story based on the premise that the US Congress is mysteriously\nassassinated, and fundamentalists quickly take charge of the nation to set it\n"right" again. The book is the diary of a woman\'s life as she tries to live\nunder the new Christian theocracy. Women\'s right to own property is revoked,\nand their bank accounts are closed; sinful luxuries are outlawed, and the\nradio is only used for readings from the Bible. Crimes are punished\nretroactively: doctors who performed legal abortions in the "old world" are\nhunted down and hanged. Atwood\'s writing style is difficult to get used to\nat first, but the tale grows more and more chilling as it goes on.\n\nVARIOUS AUTHORS\n\n"The Bible"\nThis somewhat dull and rambling work has often been criticized. However, it\nis probably worth reading, if only so that you\'ll know what all the fuss is\nabout. It exists in many different versions, so make sure you get the one\ntrue version.\n\n Books -- Non-fiction\n\nPETER DE ROSA\n\n"Vicars of Christ", Bantam Press, 1988\nAlthough de Rosa seems to be Christian or even Catholic this is a very\nenlighting history of papal immoralities, adulteries, fallacies etc.\n(German translation: "Gottes erste Diener. Die dunkle Seite des Papsttums",\nDroemer-Knaur, 1989)\n\nMICHAEL MARTIN\n\n"Atheism: A Philosophical Justification", Temple University Press,\n Philadelphia, USA.\nA detailed and scholarly justification of atheism. Contains an outstanding\nappendix defining terminology and usage in this (necessarily) tendentious\narea. Argues both for "negative atheism" (i.e. the "non-belief in the\nexistence of god(s)") and also for "positive atheism" ("the belief in the\nnon-existence of god(s)"). Includes great refutations of the most\nchallenging arguments for god; particular attention is paid to refuting\ncontempory theists such as Platinga and Swinburne.\n541 pages. ISBN 0-87722-642-3 (hardcover; paperback also available)\n\n"The Case Against Christianity", Temple University Press\nA comprehensive critique of Christianity, in which he considers\nthe best contemporary defences of Christianity and (ultimately)\ndemonstrates that they are unsupportable and/or incoherent.\n273 pages. ISBN 0-87722-767-5\n\nJAMES TURNER\n\n"Without God, Without Creed", The Johns Hopkins University Press, Baltimore,\n MD, USA\nSubtitled "The Origins of Unbelief in America". Examines the way in which\nunbelief (whether agnostic or atheistic) became a mainstream alternative\nworld-view. Focusses on the period 1770-1900, and while considering France\nand Britain the emphasis is on American, and particularly New England\ndevelopments. "Neither a religious history of secularization or atheism,\nWithout God, Without Creed is, rather, the intellectual history of the fate\nof a single idea, the belief that God exists." \n316 pages. ISBN (hardcover) 0-8018-2494-X (paper) 0-8018-3407-4\n\nGEORGE SELDES (Editor)\n\n"The great thoughts", Ballantine Books, New York, USA\nA "dictionary of quotations" of a different kind, concentrating on statements\nand writings which, explicitly or implicitly, present the person\'s philosophy\nand world-view. Includes obscure (and often suppressed) opinions from many\npeople. For some popular observations, traces the way in which various\npeople expressed and twisted the idea over the centuries. Quite a number of\nthe quotations are derived from Cardiff\'s "What Great Men Think of Religion"\nand Noyes\' "Views of Religion".\n490 pages. ISBN (paper) 0-345-29887-X.\n\nRICHARD SWINBURNE\n\n"The Existence of God (Revised Edition)", Clarendon Paperbacks, Oxford\nThis book is the second volume in a trilogy that began with "The Coherence of\nTheism" (1977) and was concluded with "Faith and Reason" (1981). In this\nwork, Swinburne attempts to construct a series of inductive arguments for the\nexistence of God. His arguments, which are somewhat tendentious and rely\nupon the imputation of late 20th century western Christian values and\naesthetics to a God which is supposedly as simple as can be conceived, were\ndecisively rejected in Mackie\'s "The Miracle of Theism". In the revised\nedition of "The Existence of God", Swinburne includes an Appendix in which he\nmakes a somewhat incoherent attempt to rebut Mackie.\n\nJ. L. MACKIE\n\n"The Miracle of Theism", Oxford\nThis (posthumous) volume contains a comprehensive review of the principal\narguments for and against the existence of God. It ranges from the classical\nphilosophical positions of Descartes, Anselm, Berkeley, Hume et al, through\nthe moral arguments of Newman, Kant and Sidgwick, to the recent restatements\nof the classical theses by Plantinga and Swinburne. It also addresses those\npositions which push the concept of God beyond the realm of the rational,\nsuch as those of Kierkegaard, Kung and Philips, as well as "replacements for\nGod" such as Lelie\'s axiarchism. The book is a delight to read - less\nformalistic and better written than Martin\'s works, and refreshingly direct\nwhen compared with the hand-waving of Swinburne.\n\nJAMES A. HAUGHT\n\n"Holy Horrors: An Illustrated History of Religious Murder and Madness",\n Prometheus Books\nLooks at religious persecution from ancient times to the present day -- and\nnot only by Christians.\nLibrary of Congress Catalog Card Number 89-64079. 1990.\n\nNORM R. ALLEN, JR.\n\n"African American Humanism: an Anthology"\nSee the listing for African Americans for Humanism above.\n\nGORDON STEIN\n\n"An Anthology of Atheism and Rationalism", Prometheus Books\nAn anthology covering a wide range of subjects, including \'The Devil, Evil\nand Morality\' and \'The History of Freethought\'. Comprehensive bibliography.\n\nEDMUND D. COHEN\n\n"The Mind of The Bible-Believer", Prometheus Books\nA study of why people become Christian fundamentalists, and what effect it\nhas on them.\n\n Net Resources\n\nThere\'s a small mail-based archive server at mantis.co.uk which carries\narchives of old alt.atheism.moderated articles and assorted other files. For\nmore information, send mail to archive-server@mantis.co.uk saying\n\n help\n send atheism/index\n\nand it will mail back a reply.\n\n\nmathew\n\n'
In [8]:
Copied!
np.unique(df.target)
np.unique(df.target)
Out[8]:
array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
'sci.electronics', 'sci.med', 'sci.space',
'soc.religion.christian', 'talk.politics.guns',
'talk.politics.mideast', 'talk.politics.misc',
'talk.religion.misc'], dtype=object)
查看样本均衡与否¶
通过看每个类占总类别的百分比,可以看到总体是比较均衡
In [9]:
Copied!
for i in ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
'sci.electronics', 'sci.med', 'sci.space',
'soc.religion.christian', 'talk.politics.guns',
'talk.politics.mideast', 'talk.politics.misc',
'talk.religion.misc']:
print(i,(df.target==i).sum()/len(df.target))
for i in ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
'sci.electronics', 'sci.med', 'sci.space',
'soc.religion.christian', 'talk.politics.guns',
'talk.politics.mideast', 'talk.politics.misc',
'talk.religion.misc']:
print(i,(df.target==i).sum()/len(df.target))
alt.atheism 0.050007501125168774 comp.graphics 0.050007501125168774 comp.os.ms-windows.misc 0.050007501125168774 comp.sys.ibm.pc.hardware 0.050007501125168774 comp.sys.mac.hardware 0.050007501125168774 comp.windows.x 0.050007501125168774 misc.forsale 0.050007501125168774 rec.autos 0.050007501125168774 rec.motorcycles 0.050007501125168774 rec.sport.baseball 0.050007501125168774 rec.sport.hockey 0.050007501125168774 sci.crypt 0.050007501125168774 sci.electronics 0.050007501125168774 sci.med 0.050007501125168774 sci.space 0.050007501125168774 soc.religion.christian 0.049857478621793266 talk.politics.guns 0.050007501125168774 talk.politics.mideast 0.050007501125168774 talk.politics.misc 0.050007501125168774 talk.religion.misc 0.050007501125168774
数据集划分¶
In [10]:
Copied!
x_train, x_test, y_train, y_test = train_test_split(df.data,df.target,test_size=0.2,random_state=42)
x_train, x_test, y_train, y_test = train_test_split(df.data,df.target,test_size=0.2,random_state=42)
文本特征提取¶
使用TF-IDF向量
In [11]:
Copied!
tfidf=TFIDF().fit(x_train)
Xtrain_=tfidf.transform(x_train)
Xtest_=tfidf.transform(x_test)
tfidf=TFIDF().fit(x_train)
Xtrain_=tfidf.transform(x_train)
Xtest_=tfidf.transform(x_test)
三种不同分布的朴素贝叶斯模型¶
接下来使用三种不同分布的朴素贝叶斯模型先进行预测。分别是Multinomial,Complement,Bernuolli
由于这里的矩阵比较稀疏,因此这里不宜使用高斯朴素贝叶斯。
In [12]:
Copied!
name=['Multinomial','Complement','Bernuolli']
models=[MultinomialNB(),ComplementNB(),BernoulliNB()]
for name,mod in zip(name,models):
times=time()
mod.fit(Xtrain_,y_train)
y_pred=mod.predict(Xtest_)
score=mod.score(Xtest_,y_test)
print(name)
print("\tAccuracy:{:.3f}".format(score))
print(classification_report(y_test, y_pred, target_names=np.unique(df.target)))
print(datetime.datetime.fromtimestamp(time()-times).strftime("%M:%S:%f"))
name=['Multinomial','Complement','Bernuolli']
models=[MultinomialNB(),ComplementNB(),BernoulliNB()]
for name,mod in zip(name,models):
times=time()
mod.fit(Xtrain_,y_train)
y_pred=mod.predict(Xtest_)
score=mod.score(Xtest_,y_test)
print(name)
print("\tAccuracy:{:.3f}".format(score))
print(classification_report(y_test, y_pred, target_names=np.unique(df.target)))
print(datetime.datetime.fromtimestamp(time()-times).strftime("%M:%S:%f"))
Multinomial
Accuracy:0.887
precision recall f1-score support
alt.atheism 0.78 0.84 0.81 237
comp.graphics 0.86 0.84 0.85 237
comp.os.ms-windows.misc 0.94 0.87 0.91 271
comp.sys.ibm.pc.hardware 0.81 0.89 0.85 247
comp.sys.mac.hardware 0.90 0.93 0.91 253
comp.windows.x 0.94 0.89 0.92 237
misc.forsale 0.93 0.79 0.86 247
rec.autos 0.87 0.92 0.89 254
rec.motorcycles 0.98 0.94 0.96 259
rec.sport.baseball 0.96 0.94 0.95 244
rec.sport.hockey 0.93 0.97 0.95 251
sci.crypt 0.89 0.98 0.93 242
sci.electronics 0.94 0.80 0.87 269
sci.med 1.00 0.89 0.94 256
sci.space 0.93 0.96 0.94 262
soc.religion.christian 0.87 1.00 0.93 260
talk.politics.guns 0.83 0.94 0.88 253
talk.politics.mideast 0.94 0.94 0.94 251
talk.politics.misc 0.77 0.76 0.77 242
talk.religion.misc 0.66 0.59 0.62 228
accuracy 0.89 5000
macro avg 0.89 0.89 0.88 5000
weighted avg 0.89 0.89 0.89 5000
00:00:390859
Complement
Accuracy:0.902
precision recall f1-score support
alt.atheism 0.77 0.83 0.80 237
comp.graphics 0.86 0.86 0.86 237
comp.os.ms-windows.misc 0.92 0.90 0.91 271
comp.sys.ibm.pc.hardware 0.86 0.83 0.84 247
comp.sys.mac.hardware 0.93 0.91 0.92 253
comp.windows.x 0.91 0.93 0.92 237
misc.forsale 0.91 0.85 0.88 247
rec.autos 0.91 0.94 0.92 254
rec.motorcycles 0.97 0.99 0.98 259
rec.sport.baseball 0.97 0.96 0.97 244
rec.sport.hockey 0.93 0.99 0.96 251
sci.crypt 0.97 0.99 0.98 242
sci.electronics 0.93 0.91 0.92 269
sci.med 0.99 0.96 0.97 256
sci.space 0.93 0.98 0.96 262
soc.religion.christian 0.90 1.00 0.95 260
talk.politics.guns 0.87 0.92 0.89 253
talk.politics.mideast 0.91 0.97 0.94 251
talk.politics.misc 0.83 0.75 0.79 242
talk.religion.misc 0.68 0.52 0.59 228
accuracy 0.90 5000
macro avg 0.90 0.90 0.90 5000
weighted avg 0.90 0.90 0.90 5000
00:00:394772
Bernuolli
Accuracy:0.853
precision recall f1-score support
alt.atheism 0.76 0.81 0.78 237
comp.graphics 0.68 0.96 0.80 237
comp.os.ms-windows.misc 0.98 0.19 0.32 271
comp.sys.ibm.pc.hardware 0.79 0.96 0.87 247
comp.sys.mac.hardware 0.95 0.98 0.96 253
comp.windows.x 0.88 0.87 0.87 237
misc.forsale 0.47 0.96 0.63 247
rec.autos 0.96 0.91 0.94 254
rec.motorcycles 1.00 0.95 0.97 259
rec.sport.baseball 0.99 0.94 0.96 244
rec.sport.hockey 1.00 0.93 0.96 251
sci.crypt 0.96 0.93 0.94 242
sci.electronics 0.92 0.95 0.93 269
sci.med 1.00 0.84 0.91 256
sci.space 0.98 0.89 0.93 262
soc.religion.christian 0.99 0.97 0.98 260
talk.politics.guns 0.92 0.91 0.91 253
talk.politics.mideast 0.94 0.80 0.87 251
talk.politics.misc 0.81 0.71 0.76 242
talk.religion.misc 0.66 0.62 0.64 228
accuracy 0.85 5000
macro avg 0.88 0.85 0.85 5000
weighted avg 0.89 0.85 0.85 5000
00:00:551102
可以看到,在目前还没有进行参数调整的情况下,Complement的Accuracy表现最好,但另外两种表现也不错。
交叉验证&选择最优超参数¶
In [15]:
Copied!
# 交叉验证
cv1= []
cv2= []
cv3= []
mean1=[]
mean2=[]
mean3=[]
search_list = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.2,0.5,0.7,0.9,1]
for i in search_list:
accuracy1 = []
accuracy2 = []
accuracy3 = []
print("alpha:",i)
for j in range(5):
print("-------------fold{:.1f}------------".format(j))
x_train, x_test, y_train, y_test = train_test_split(df.data,df.target,test_size=0.2,random_state=42,shuffle=True)
tfidf=TFIDF().fit(x_train)
Xtrain_=tfidf.transform(x_train)
Xtest_=tfidf.transform(x_test)
model1 = MultinomialNB(alpha=i)
model1.fit(Xtrain_, y_train)
val_accuracy1= model1.score(Xtest_, y_test)
model2 = ComplementNB(alpha=i)
model2.fit(Xtrain_, y_train)
val_accuracy2= model2.score(Xtest_, y_test)
model3 = BernoulliNB(alpha=i)
model3.fit(Xtrain_, y_train)
val_accuracy3= model3.score(Xtest_, y_test)
print(j+1,"val_accuracy1:", val_accuracy1)
print(j+1,"val_accuracy2:", val_accuracy2)
print(j+1,"val_accuracy3:", val_accuracy3)
accuracy1.append(val_accuracy1)
accuracy2.append(val_accuracy2)
accuracy3.append(val_accuracy3)
acc_sum=0
for t in range(len(accuracy1)):
acc_sum=acc_sum+accuracy1[t]
print("alpha:",i,"accuracy1:",acc_sum/5)
acc_sum=0
for t in range(len(accuracy2)):
acc_sum=acc_sum+accuracy2[t]
print("alpha:",i,"accuracy2:",acc_sum/5)
acc_sum=0
for t in range(len(accuracy3)):
acc_sum=acc_sum+accuracy3[t]
print("alpha:",i,"accuracy3:",acc_sum/5)
print("\n")
cv1.append(accuracy1)
cv2.append(accuracy2)
cv3.append(accuracy3)
for i in cv1:
mean1.append(np.mean(i))
for i in cv2:
mean2.append(np.mean(i))
for i in cv3:
mean3.append(np.mean(i))
# 交叉验证
cv1= []
cv2= []
cv3= []
mean1=[]
mean2=[]
mean3=[]
search_list = [0.00001,0.0001,0.0005,0.001,0.005,0.01,0.05,0.1,0.2,0.5,0.7,0.9,1]
for i in search_list:
accuracy1 = []
accuracy2 = []
accuracy3 = []
print("alpha:",i)
for j in range(5):
print("-------------fold{:.1f}------------".format(j))
x_train, x_test, y_train, y_test = train_test_split(df.data,df.target,test_size=0.2,random_state=42,shuffle=True)
tfidf=TFIDF().fit(x_train)
Xtrain_=tfidf.transform(x_train)
Xtest_=tfidf.transform(x_test)
model1 = MultinomialNB(alpha=i)
model1.fit(Xtrain_, y_train)
val_accuracy1= model1.score(Xtest_, y_test)
model2 = ComplementNB(alpha=i)
model2.fit(Xtrain_, y_train)
val_accuracy2= model2.score(Xtest_, y_test)
model3 = BernoulliNB(alpha=i)
model3.fit(Xtrain_, y_train)
val_accuracy3= model3.score(Xtest_, y_test)
print(j+1,"val_accuracy1:", val_accuracy1)
print(j+1,"val_accuracy2:", val_accuracy2)
print(j+1,"val_accuracy3:", val_accuracy3)
accuracy1.append(val_accuracy1)
accuracy2.append(val_accuracy2)
accuracy3.append(val_accuracy3)
acc_sum=0
for t in range(len(accuracy1)):
acc_sum=acc_sum+accuracy1[t]
print("alpha:",i,"accuracy1:",acc_sum/5)
acc_sum=0
for t in range(len(accuracy2)):
acc_sum=acc_sum+accuracy2[t]
print("alpha:",i,"accuracy2:",acc_sum/5)
acc_sum=0
for t in range(len(accuracy3)):
acc_sum=acc_sum+accuracy3[t]
print("alpha:",i,"accuracy3:",acc_sum/5)
print("\n")
cv1.append(accuracy1)
cv2.append(accuracy2)
cv3.append(accuracy3)
for i in cv1:
mean1.append(np.mean(i))
for i in cv2:
mean2.append(np.mean(i))
for i in cv3:
mean3.append(np.mean(i))
alpha: 1e-05 -------------fold0.0------------ 1 val_accuracy1: 0.8735 1 val_accuracy2: 0.859 1 val_accuracy3: 0.9035 -------------fold1.0------------ 2 val_accuracy1: 0.8735 2 val_accuracy2: 0.859 2 val_accuracy3: 0.9035 -------------fold2.0------------ 3 val_accuracy1: 0.8735 3 val_accuracy2: 0.859 3 val_accuracy3: 0.9035 -------------fold3.0------------ 4 val_accuracy1: 0.8735 4 val_accuracy2: 0.859 4 val_accuracy3: 0.9035 -------------fold4.0------------ 5 val_accuracy1: 0.8735 5 val_accuracy2: 0.859 5 val_accuracy3: 0.9035 alpha: 1e-05 accuracy1: 0.8735000000000002 alpha: 1e-05 accuracy2: 0.859 alpha: 1e-05 accuracy3: 0.9035 alpha: 0.0001 -------------fold0.0------------ 1 val_accuracy1: 0.8795 1 val_accuracy2: 0.8655 1 val_accuracy3: 0.90425 -------------fold1.0------------ 2 val_accuracy1: 0.8795 2 val_accuracy2: 0.8655 2 val_accuracy3: 0.90425 -------------fold2.0------------ 3 val_accuracy1: 0.8795 3 val_accuracy2: 0.8655 3 val_accuracy3: 0.90425 -------------fold3.0------------ 4 val_accuracy1: 0.8795 4 val_accuracy2: 0.8655 4 val_accuracy3: 0.90425 -------------fold4.0------------ 5 val_accuracy1: 0.8795 5 val_accuracy2: 0.8655 5 val_accuracy3: 0.90425 alpha: 0.0001 accuracy1: 0.8795 alpha: 0.0001 accuracy2: 0.8655000000000002 alpha: 0.0001 accuracy3: 0.90425 alpha: 0.0005 -------------fold0.0------------ 1 val_accuracy1: 0.883 1 val_accuracy2: 0.87 1 val_accuracy3: 0.904 -------------fold1.0------------ 2 val_accuracy1: 0.883 2 val_accuracy2: 0.87 2 val_accuracy3: 0.904 -------------fold2.0------------ 3 val_accuracy1: 0.883 3 val_accuracy2: 0.87 3 val_accuracy3: 0.904 -------------fold3.0------------ 4 val_accuracy1: 0.883 4 val_accuracy2: 0.87 4 val_accuracy3: 0.904 -------------fold4.0------------ 5 val_accuracy1: 0.883 5 val_accuracy2: 0.87 5 val_accuracy3: 0.904 alpha: 0.0005 accuracy1: 0.883 alpha: 0.0005 accuracy2: 0.8699999999999999 alpha: 0.0005 accuracy3: 0.9040000000000001 alpha: 0.001 -------------fold0.0------------ 1 val_accuracy1: 0.88525 1 val_accuracy2: 0.874 1 val_accuracy3: 0.90175 -------------fold1.0------------ 2 val_accuracy1: 0.88525 2 val_accuracy2: 0.874 2 val_accuracy3: 0.90175 -------------fold2.0------------ 3 val_accuracy1: 0.88525 3 val_accuracy2: 0.874 3 val_accuracy3: 0.90175 -------------fold3.0------------ 4 val_accuracy1: 0.88525 4 val_accuracy2: 0.874 4 val_accuracy3: 0.90175 -------------fold4.0------------ 5 val_accuracy1: 0.88525 5 val_accuracy2: 0.874 5 val_accuracy3: 0.90175 alpha: 0.001 accuracy1: 0.8852499999999999 alpha: 0.001 accuracy2: 0.874 alpha: 0.001 accuracy3: 0.90175 alpha: 0.005 -------------fold0.0------------ 1 val_accuracy1: 0.8895 1 val_accuracy2: 0.88175 1 val_accuracy3: 0.89875 -------------fold1.0------------ 2 val_accuracy1: 0.8895 2 val_accuracy2: 0.88175 2 val_accuracy3: 0.89875 -------------fold2.0------------ 3 val_accuracy1: 0.8895 3 val_accuracy2: 0.88175 3 val_accuracy3: 0.89875 -------------fold3.0------------ 4 val_accuracy1: 0.8895 4 val_accuracy2: 0.88175 4 val_accuracy3: 0.89875 -------------fold4.0------------ 5 val_accuracy1: 0.8895 5 val_accuracy2: 0.88175 5 val_accuracy3: 0.89875 alpha: 0.005 accuracy1: 0.8895 alpha: 0.005 accuracy2: 0.88175 alpha: 0.005 accuracy3: 0.89875 alpha: 0.01 -------------fold0.0------------ 1 val_accuracy1: 0.8925 1 val_accuracy2: 0.885 1 val_accuracy3: 0.89825 -------------fold1.0------------ 2 val_accuracy1: 0.8925 2 val_accuracy2: 0.885 2 val_accuracy3: 0.89825 -------------fold2.0------------ 3 val_accuracy1: 0.8925 3 val_accuracy2: 0.885 3 val_accuracy3: 0.89825 -------------fold3.0------------ 4 val_accuracy1: 0.8925 4 val_accuracy2: 0.885 4 val_accuracy3: 0.89825 -------------fold4.0------------ 5 val_accuracy1: 0.8925 5 val_accuracy2: 0.885 5 val_accuracy3: 0.89825 alpha: 0.01 accuracy1: 0.8924999999999998 alpha: 0.01 accuracy2: 0.885 alpha: 0.01 accuracy3: 0.89825 alpha: 0.05 -------------fold0.0------------ 1 val_accuracy1: 0.8965 1 val_accuracy2: 0.89 1 val_accuracy3: 0.892 -------------fold1.0------------ 2 val_accuracy1: 0.8965 2 val_accuracy2: 0.89 2 val_accuracy3: 0.892 -------------fold2.0------------ 3 val_accuracy1: 0.8965 3 val_accuracy2: 0.89 3 val_accuracy3: 0.892 -------------fold3.0------------ 4 val_accuracy1: 0.8965 4 val_accuracy2: 0.89 4 val_accuracy3: 0.892 -------------fold4.0------------ 5 val_accuracy1: 0.8965 5 val_accuracy2: 0.89 5 val_accuracy3: 0.892 alpha: 0.05 accuracy1: 0.8965 alpha: 0.05 accuracy2: 0.89 alpha: 0.05 accuracy3: 0.892 alpha: 0.1 -------------fold0.0------------ 1 val_accuracy1: 0.895 1 val_accuracy2: 0.89425 1 val_accuracy3: 0.88775 -------------fold1.0------------ 2 val_accuracy1: 0.895 2 val_accuracy2: 0.89425 2 val_accuracy3: 0.88775 -------------fold2.0------------ 3 val_accuracy1: 0.895 3 val_accuracy2: 0.89425 3 val_accuracy3: 0.88775 -------------fold3.0------------ 4 val_accuracy1: 0.895 4 val_accuracy2: 0.89425 4 val_accuracy3: 0.88775 -------------fold4.0------------ 5 val_accuracy1: 0.895 5 val_accuracy2: 0.89425 5 val_accuracy3: 0.88775 alpha: 0.1 accuracy1: 0.8949999999999999 alpha: 0.1 accuracy2: 0.8942499999999999 alpha: 0.1 accuracy3: 0.8877500000000002 alpha: 0.2 -------------fold0.0------------ 1 val_accuracy1: 0.89525 1 val_accuracy2: 0.899 1 val_accuracy3: 0.88125 -------------fold1.0------------ 2 val_accuracy1: 0.89525 2 val_accuracy2: 0.899 2 val_accuracy3: 0.88125 -------------fold2.0------------ 3 val_accuracy1: 0.89525 3 val_accuracy2: 0.899 3 val_accuracy3: 0.88125 -------------fold3.0------------ 4 val_accuracy1: 0.89525 4 val_accuracy2: 0.899 4 val_accuracy3: 0.88125 -------------fold4.0------------ 5 val_accuracy1: 0.89525 5 val_accuracy2: 0.899 5 val_accuracy3: 0.88125 alpha: 0.2 accuracy1: 0.8952500000000001 alpha: 0.2 accuracy2: 0.899 alpha: 0.2 accuracy3: 0.88125 alpha: 0.5 -------------fold0.0------------ 1 val_accuracy1: 0.896 1 val_accuracy2: 0.9035 1 val_accuracy3: 0.86525 -------------fold1.0------------ 2 val_accuracy1: 0.896 2 val_accuracy2: 0.9035 2 val_accuracy3: 0.86525 -------------fold2.0------------ 3 val_accuracy1: 0.896 3 val_accuracy2: 0.9035 3 val_accuracy3: 0.86525 -------------fold3.0------------ 4 val_accuracy1: 0.896 4 val_accuracy2: 0.9035 4 val_accuracy3: 0.86525 -------------fold4.0------------ 5 val_accuracy1: 0.896 5 val_accuracy2: 0.9035 5 val_accuracy3: 0.86525 alpha: 0.5 accuracy1: 0.8960000000000001 alpha: 0.5 accuracy2: 0.9035 alpha: 0.5 accuracy3: 0.86525 alpha: 0.7 -------------fold0.0------------ 1 val_accuracy1: 0.89375 1 val_accuracy2: 0.9035 1 val_accuracy3: 0.858 -------------fold1.0------------ 2 val_accuracy1: 0.89375 2 val_accuracy2: 0.9035 2 val_accuracy3: 0.858 -------------fold2.0------------ 3 val_accuracy1: 0.89375 3 val_accuracy2: 0.9035 3 val_accuracy3: 0.858 -------------fold3.0------------ 4 val_accuracy1: 0.89375 4 val_accuracy2: 0.9035 4 val_accuracy3: 0.858 -------------fold4.0------------ 5 val_accuracy1: 0.89375 5 val_accuracy2: 0.9035 5 val_accuracy3: 0.858 alpha: 0.7 accuracy1: 0.89375 alpha: 0.7 accuracy2: 0.9035 alpha: 0.7 accuracy3: 0.858 alpha: 0.9 -------------fold0.0------------ 1 val_accuracy1: 0.8925 1 val_accuracy2: 0.90475 1 val_accuracy3: 0.84625 -------------fold1.0------------ 2 val_accuracy1: 0.8925 2 val_accuracy2: 0.90475 2 val_accuracy3: 0.84625 -------------fold2.0------------ 3 val_accuracy1: 0.8925 3 val_accuracy2: 0.90475 3 val_accuracy3: 0.84625 -------------fold3.0------------ 4 val_accuracy1: 0.8925 4 val_accuracy2: 0.90475 4 val_accuracy3: 0.84625 -------------fold4.0------------ 5 val_accuracy1: 0.8925 5 val_accuracy2: 0.90475 5 val_accuracy3: 0.84625 alpha: 0.9 accuracy1: 0.8924999999999998 alpha: 0.9 accuracy2: 0.9047500000000002 alpha: 0.9 accuracy3: 0.8462499999999998 alpha: 1 -------------fold0.0------------ 1 val_accuracy1: 0.89175 1 val_accuracy2: 0.905 1 val_accuracy3: 0.842 -------------fold1.0------------ 2 val_accuracy1: 0.89175 2 val_accuracy2: 0.905 2 val_accuracy3: 0.842 -------------fold2.0------------ 3 val_accuracy1: 0.89175 3 val_accuracy2: 0.905 3 val_accuracy3: 0.842 -------------fold3.0------------ 4 val_accuracy1: 0.89175 4 val_accuracy2: 0.905 4 val_accuracy3: 0.842 -------------fold4.0------------ 5 val_accuracy1: 0.89175 5 val_accuracy2: 0.905 5 val_accuracy3: 0.842 alpha: 1 accuracy1: 0.89175 alpha: 1 accuracy2: 0.905 alpha: 1 accuracy3: 0.842
绘图¶
大致的参数与正确率曲线
In [17]:
Copied!
df_plot = pd.DataFrame()
df_plot['alpha'] = search_list
df_plot['Multinomial Classification Accuracy'] = mean1
df_plot['Complement Classification Accuracy'] = mean2
df_plot['Bernuolli Classification Accuracy'] = mean3
df_plot = pd.DataFrame(df_plot).set_index('alpha')
sns.lineplot(data=df_plot)
plt.title("comparison of classification accuracy")
plt.show()
df_plot = pd.DataFrame()
df_plot['alpha'] = search_list
df_plot['Multinomial Classification Accuracy'] = mean1
df_plot['Complement Classification Accuracy'] = mean2
df_plot['Bernuolli Classification Accuracy'] = mean3
df_plot = pd.DataFrame(df_plot).set_index('alpha')
sns.lineplot(data=df_plot)
plt.title("comparison of classification accuracy")
plt.show()
5折交叉验证误差图
In [20]:
Copied!
dict_=dict(zip(search_list,cv2))
accu_mean=[]
accu_std=[]
for i,j in dict_.items():
accu_mean.append(np.mean(j))
accu_std.append(np.std(j))
for i in search_list:
accus = dict_[i]
plt.scatter([i] * len(accus), accus)
plt.errorbar(search_list, accu_mean, accu_std,ecolor='green')
plt.title('5-Fold Cross validation')
plt.xlabel('alpha')
plt.ylabel('Cross validation mean accuracy')
plt.show()
dict_=dict(zip(search_list,cv2))
accu_mean=[]
accu_std=[]
for i,j in dict_.items():
accu_mean.append(np.mean(j))
accu_std.append(np.std(j))
for i in search_list:
accus = dict_[i]
plt.scatter([i] * len(accus), accus)
plt.errorbar(search_list, accu_mean, accu_std,ecolor='green')
plt.title('5-Fold Cross validation')
plt.xlabel('alpha')
plt.ylabel('Cross validation mean accuracy')
plt.show()
最后达到的效果:¶
In [22]:
Copied!
alpha_best=search_list[np.argmax(mean2)]
print(alpha_best)
alpha_best=search_list[np.argmax(mean2)]
print(alpha_best)
1
In [23]:
Copied!
model = ComplementNB(alpha=alpha_best)
model.fit(Xtrain_, y_train)
y_pred=model.predict(Xtest_)
val_accuracy= model.score(Xtest_, y_test)
print(classification_report(y_test, y_pred, target_names=np.unique(df.target)))
model = ComplementNB(alpha=alpha_best)
model.fit(Xtrain_, y_train)
y_pred=model.predict(Xtest_)
val_accuracy= model.score(Xtest_, y_test)
print(classification_report(y_test, y_pred, target_names=np.unique(df.target)))
precision recall f1-score support
alt.atheism 0.77 0.81 0.79 173
comp.graphics 0.88 0.86 0.87 179
comp.os.ms-windows.misc 0.93 0.93 0.93 226
comp.sys.ibm.pc.hardware 0.91 0.83 0.87 204
comp.sys.mac.hardware 0.93 0.93 0.93 205
comp.windows.x 0.92 0.92 0.92 186
misc.forsale 0.90 0.85 0.88 190
rec.autos 0.90 0.96 0.93 203
rec.motorcycles 0.98 1.00 0.99 218
rec.sport.baseball 0.98 0.96 0.97 192
rec.sport.hockey 0.93 0.99 0.96 203
sci.crypt 0.97 0.99 0.98 200
sci.electronics 0.93 0.92 0.92 227
sci.med 0.98 0.97 0.98 196
sci.space 0.93 0.99 0.96 205
soc.religion.christian 0.90 1.00 0.95 215
talk.politics.guns 0.87 0.90 0.89 205
talk.politics.mideast 0.91 0.98 0.94 197
talk.politics.misc 0.82 0.71 0.76 200
talk.religion.misc 0.64 0.51 0.57 176
accuracy 0.91 4000
macro avg 0.90 0.90 0.90 4000
weighted avg 0.90 0.91 0.90 4000
最后达到91%的准确率.