基于Python的簡(jiǎn)單自然語(yǔ)言處理實(shí)踐
基于 Python 的簡(jiǎn)單自然語(yǔ)言處理
本文是對(duì)于基于 Python 進(jìn)行簡(jiǎn)單自然語(yǔ)言處理任務(wù)的介紹,本文的所有代碼放置在這里。建議前置閱讀 Python 語(yǔ)法速覽與機(jī)器學(xué)習(xí)開(kāi)發(fā)環(huán)境搭建,更多機(jī)器學(xué)習(xí)資料參考機(jī)器學(xué)習(xí)、深度學(xué)習(xí)與自然語(yǔ)言處理領(lǐng)域推薦的書(shū)籍列表以及面向程序猿的數(shù)據(jù)科學(xué)與機(jī)器學(xué)習(xí)知識(shí)體系及資料合集。
Twenty News Group 語(yǔ)料集處理
20 Newsgroup 數(shù)據(jù)集包含了約 20000 篇來(lái)自于不同的新聞組的文檔,最早由 Ken Lang 搜集整理。本部分包含了對(duì)于數(shù)據(jù)集的抓取、特征提取、簡(jiǎn)單分類(lèi)器訓(xùn)練、主題模型訓(xùn)練等。本部分代碼包括主要的處理代碼封裝庫(kù)與基于 Notebook 的交互示范。我們首先需要進(jìn)行數(shù)據(jù)抓取:
- def fetch_data(self, subset='train', categories=None):
- """return data
- 執(zhí)行數(shù)據(jù)抓取操作
- Arguments:
- subset -> string -- 抓取的目標(biāo)集合 train / test / all
- """
- rand = np.random.mtrand.RandomState(8675309)
- data = fetch_20newsgroups(subset=subset,
- categories=categories,
- shuffle=True,
- random_state=rand)
- self.data[subset] = data
然后在 Notebook 中交互查看數(shù)據(jù)格式:
- # 實(shí)例化對(duì)象
- twp = TwentyNewsGroup()
- # 抓取數(shù)據(jù)
- twp.fetch_data()
- twenty_train = twp.data['train']
- print("數(shù)據(jù)集結(jié)構(gòu)", "->", twenty_train.keys())
- print("文檔數(shù)目", "->", len(twenty_train.data))
- print("目標(biāo)分類(lèi)", "->",[ twenty_train.target_names[t] for t in twenty_train.target[:10]])
- 數(shù)據(jù)集結(jié)構(gòu) -> dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])
- 文檔數(shù)目 -> 11314
- 目標(biāo)分類(lèi) -> ['sci.space', 'comp.sys.mac.hardware', 'sci.electronics', 'comp.sys.mac.hardware', 'sci.space', 'rec.sport.hockey', 'talk.religion.misc', 'sci.med', 'talk.religion.misc', 'talk.politics.guns']
接下來(lái)我們可以對(duì)語(yǔ)料集中的特征進(jìn)行提?。?/p>
- # 進(jìn)行特征提取
- # 構(gòu)建文檔-詞矩陣(Document-Term Matrix)
- from sklearn.feature_extraction.text import CountVectorizer
- count_vect = CountVectorizer()
- X_train_counts = count_vect.fit_transform(twenty_train.data)
- print("DTM 結(jié)構(gòu)","->",X_train_counts.shape)
- # 查看某個(gè)詞在詞表中的下標(biāo)
- print("詞對(duì)應(yīng)下標(biāo)","->", count_vect.vocabulary_.get(u'algorithm'))
- DTM 結(jié)構(gòu) -> (11314, 130107)
- 詞對(duì)應(yīng)下標(biāo) -> 27366
為了將文檔用于進(jìn)行分類(lèi)任務(wù),還需要使用 TF-IDF 等常見(jiàn)方法將其轉(zhuǎn)化為特征向量:
- # 構(gòu)建文檔的 TF 特征向量
- from sklearn.feature_extraction.text import TfidfTransformer
- tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
- X_train_tf = tf_transformer.transform(X_train_counts)
- print("某文檔 TF 特征向量","->",X_train_tf)
- # 構(gòu)建文檔的 TF-IDF 特征向量
- from sklearn.feature_extraction.text import TfidfTransformer
- tf_transformer = TfidfTransformer().fit(X_train_counts)
- X_train_tfidf = tf_transformer.transform(X_train_counts)
- print("某文檔 TF-IDF 特征向量","->",X_train_tfidf)
- 某文檔 TF 特征向量 -> (0, 6447) 0.0380693493813
- (0, 37842) 0.0380693493813
我們可以將特征提取、分類(lèi)器訓(xùn)練與預(yù)測(cè)封裝為單獨(dú)函數(shù):
- def extract_feature(self):
- """
- 從語(yǔ)料集中抽取文檔特征
- """
- # 獲取訓(xùn)練數(shù)據(jù)的文檔-詞矩陣
- self.train_dtm = self.count_vect.fit_transform(self.data['train'].data)
- # 獲取文檔的 TF 特征
- tf_transformer = TfidfTransformer(use_idf=False)
- self.train_tf = tf_transformer.transform(self.train_dtm)
- # 獲取文檔的 TF-IDF 特征
- tfidf_transformer = TfidfTransformer().fit(self.train_dtm)
- self.train_tfidf = tf_transformer.transform(self.train_dtm)
- def train_classifier(self):
- """
- 從訓(xùn)練集中訓(xùn)練出分類(lèi)器
- """
- self.extract_feature();
- self.clf = MultinomialNB().fit(
- self.train_tfidf, self.data['train'].target)
- def predict(self, docs):
- """
- 從訓(xùn)練集中訓(xùn)練出分類(lèi)器
- """
- X_new_counts = self.count_vect.transform(docs)
- tfidf_transformer = TfidfTransformer().fit(X_new_counts)
- X_new_tfidf = tfidf_transformer.transform(X_new_counts)
- return self.clf.predict(X_new_tfidf)
然后執(zhí)行訓(xùn)練并且進(jìn)行預(yù)測(cè)與評(píng)價(jià):
- # 訓(xùn)練分類(lèi)器
- twp.train_classifier()
- # 執(zhí)行預(yù)測(cè)
- docs_new = ['God is love', 'OpenGL on the GPU is fast']
- predicted = twp.predict(docs_new)
- for doc, category in zip(docs_new, predicted):
- print('%r => %s' % (doc, twenty_train.target_names[category]))
- # 執(zhí)行模型評(píng)測(cè)
- twp.fetch_data(subset='test')
- predicted = twp.predict(twp.data['test'].data)
- import numpy as np
- # 誤差計(jì)算
- # 簡(jiǎn)單誤差均值
- np.mean(predicted == twp.data['test'].target)
- # Metrics
- from sklearn import metrics
- print(metrics.classification_report(
- twp.data['test'].target, predicted,
- target_names=twp.data['test'].target_names))
- # Confusion Matrix
- metrics.confusion_matrix(twp.data['test'].target, predicted)
- 'God is love' => soc.religion.christian
- 'OpenGL on the GPU is fast' => rec.autos
- precision recall f1-score support
- alt.atheism 0.79 0.50 0.61 319
- ...
- talk.religion.misc 1.00 0.08 0.15 251
- avg / total 0.82 0.79 0.77 7532
- Out[16]:
- array([[158, 0, 1, 1, 0, 1, 0, 3, 7, 1, 2, 6, 1,
- 8, 3, 114, 6, 7, 0, 0],
- ...
- [ 35, 3, 1, 0, 0, 0, 1, 4, 1, 1, 6, 3, 0,
- 6, 5, 127, 30, 5, 2, 21]])
我們也可以對(duì)文檔集進(jìn)行主題提?。?/p>
# 進(jìn)行主題提取
- # 進(jìn)行主題提取
- twp.topics_by_lda()
- Topic 0 : stream s1 astronaut zoo laurentian maynard s2 gtoal pem fpu
- Topic 1 : 145 cx 0d bh sl 75u 6um m6 sy gld
- Topic 2 : apartment wpi mars nazis monash palestine ottoman sas winner gerard
- Topic 3 : livesey contest satellite tamu mathew orbital wpd marriage solntze pope
- Topic 4 : x11 contest lib font string contrib visual xterm ahl brake
- Topic 5 : ax g9v b8f a86 1d9 pl 0t wm 34u giz
- Topic 6 : printf null char manes behanna senate handgun civilians homicides magpie
- Topic 7 : buf jpeg chi tor bos det que uwo pit blah
- Topic 8 : oracle di t4 risc nist instruction msg postscript dma convex
- Topic 9 : candida cray yeast viking dog venus bloom symptoms observatory roby
- Topic 10 : cx ck hz lk mv cramer adl optilink k8 uw
- Topic 11 : ripem rsa sandvik w0 bosnia psuvm hudson utk defensive veal
- Topic 12 : db espn sabbath br widgets liar davidian urartu sdpa cooling
- Topic 13 : ripem dyer ucsu carleton adaptec tires chem alchemy lockheed rsa
- Topic 14 : ingr sv alomar jupiter borland het intergraph factory paradox captain
- Topic 15 : militia palestinian cpr pts handheld sharks igc apc jake lehigh
- Topic 16 : alaska duke col russia uoknor aurora princeton nsmca gene stereo
- Topic 17 : uuencode msg helmet eos satan dseg homosexual ics gear pyron
- Topic 18 : entries myers x11r4 radar remark cipher maine hamburg senior bontchev
- Topic 19 : cubs ufl vitamin temple gsfc mccall astro bellcore uranium wesleyan
常見(jiàn)自然語(yǔ)言處理工具封裝
經(jīng)過(guò)上面對(duì)于 20NewsGroup 語(yǔ)料集處理的介紹我們可以發(fā)現(xiàn)常見(jiàn)自然語(yǔ)言處理任務(wù)包括,數(shù)據(jù)獲取、數(shù)據(jù)預(yù)處理、數(shù)據(jù)特征提取、分類(lèi)模型訓(xùn)練、主題模型或者詞向量等高級(jí)特征提取等等。筆者還習(xí)慣用 python-fire 將類(lèi)快速封裝為可通過(guò)命令行調(diào)用的工具,同時(shí)也支持外部模塊調(diào)用使用。本部分我們主要以中文語(yǔ)料集為例,譬如我們需要對(duì)中文維基百科數(shù)據(jù)進(jìn)行分析,可以使用 gensim 中的維基百科處理類(lèi):
- class Wiki(object):
- """
- 維基百科語(yǔ)料集處理
- """
- def wiki2texts(self, wiki_data_path, wiki_texts_path='./wiki_texts.txt'):
- """
- 將維基百科數(shù)據(jù)轉(zhuǎn)化為文本數(shù)據(jù)
- Arguments:
- wiki_data_path -- 維基壓縮文件地址
- """
- if not wiki_data_path:
- print("請(qǐng)輸入 Wiki 壓縮文件路徑或者前往 https://dumps.wikimedia.org/zhwiki/ 下載")
- exit()
- # 構(gòu)建維基語(yǔ)料集
- wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
- texts_num = 0
- with open(wiki_text_path, 'w', encoding='utf-8') as output:
- for text in wiki_corpus.get_texts():
- output.write(b' '.join(text).decode('utf-8') + '\n')
- texts_num += 1
- if texts_num % 10000 == 0:
- logging.info("已處理 %d 篇文章" % texts_num)
- print("處理完畢,請(qǐng)使用 OpenCC 轉(zhuǎn)化為簡(jiǎn)體字")
抓取完畢后,我們還需要用 OpenCC 轉(zhuǎn)化為簡(jiǎn)體字。抓取完畢后我們可以使用結(jié)巴分詞對(duì)生成的文本文件進(jìn)行分詞,代碼參考這里,我們直接使用 python chinese_text_processor.py tokenize_file /output.txt 直接執(zhí)行該任務(wù)并且生成輸出文件。獲取分詞之后的文件,我們可以將其轉(zhuǎn)化為簡(jiǎn)單的詞袋表示或者文檔-詞向量,詳細(xì)代碼參考這里:
- class CorpusProcessor:
- """
- 語(yǔ)料集處理
- """
- def corpus2bow(self, tokenized_corpus=default_documents):
- """returns (vocab,corpus_in_bow)
- 將語(yǔ)料集轉(zhuǎn)化為 BOW 形式
- Arguments:
- tokenized_corpus -- 經(jīng)過(guò)分詞的文檔列表
- Return:
- vocab -- {'human': 0, ... 'minors': 11}
- corpus_in_bow -- [[(0, 1), (1, 1), (2, 1)]...]
- """
- dictionary = corpora.Dictionary(tokenized_corpus)
- # 獲取詞表
- vocab = dictionary.token2id
- # 獲取文檔的詞袋表示
- corpus_in_bow = [dictionary.doc2bow(text) for text in tokenized_corpus]
- return (vocab, corpus_in_bow)
- def corpus2dtm(self, tokenized_corpus=default_documents, min_df=10, max_df=100):
- """returns (vocab, DTM)
- 將語(yǔ)料集轉(zhuǎn)化為文檔-詞矩陣
- - dtm -> matrix: 文檔-詞矩陣
- I like hate databases
- D1 1 1 0 1
- D2 1 0 1 1
- """
- if type(tokenized_corpus[0]) is list:
- documents = [" ".join(document) for document in tokenized_corpus]
- else:
- documents = tokenized_corpus
- if max_df == -1:
- max_df = round(len(documents) / 2)
- # 構(gòu)建語(yǔ)料集統(tǒng)計(jì)向量
- vec = CountVectorizer(min_df=min_df,
- max_df=max_df,
- analyzer="word",
- token_pattern="[\S]+",
- tokenizer=None,
- preprocessor=None,
- stop_words=None
- )
- # 對(duì)于數(shù)據(jù)進(jìn)行分析
- DTM = vec.fit_transform(documents)
- # 獲取詞表
- vocab = vec.get_feature_names()
- return (vocab, DTM)
我們也可以對(duì)分詞之后的文檔進(jìn)行主題模型或者詞向量提取,這里使用分詞之后的文件就可以忽略中英文的差異:
- def topics_by_lda(self, tokenized_corpus_path, num_topics=20, num_words=10, max_lines=10000, split="\s+", max_df=100):
- """
- 讀入經(jīng)過(guò)分詞的文件并且對(duì)其進(jìn)行 LDA 訓(xùn)練
- Arguments:
- tokenized_corpus_path -> string -- 經(jīng)過(guò)分詞的語(yǔ)料集地址
- num_topics -> integer -- 主題數(shù)目
- num_words -> integer -- 主題詞數(shù)目
- max_lines -> integer -- 每次讀入的***行數(shù)
- split -> string -- 文檔的詞之間的分隔符
- max_df -> integer -- 避免常用詞,過(guò)濾超過(guò)該閾值的詞
- """
- # 存放所有語(yǔ)料集信息
- corpus = []
- with open(tokenized_corpus_path, 'r', encoding='utf-8') as tokenized_corpus:
- flag = 0
- for document in tokenized_corpus:
- # 判斷是否讀取了足夠的行數(shù)
- if(flag > max_lines):
- break
- # 將讀取到的內(nèi)容添加到語(yǔ)料集中
- corpus.append(re.split(split, document))
- flag = flag + 1
- # 構(gòu)建語(yǔ)料集的 BOW 表示
- (vocab, DTM) = self.corpus2dtm(corpus, max_df=max_df)
- # 訓(xùn)練 LDA 模型
- lda = LdaMulticore(
- matutils.Sparse2Corpus(DTM, documents_columns=False),
- num_topics=num_topics,
- id2word=dict([(i, s) for i, s in enumerate(vocab)]),
- workers=4
- )
- # 打印并且返回主題數(shù)據(jù)
- topics = lda.show_topics(
- num_topics=num_topics,
- num_words=num_words,
- formatted=False,
- log=False)
- for ti, topic in enumerate(topics):
- print("Topic", ti, ":", " ".join(word[0] for word in topic[1]))
該函數(shù)同樣可以使用命令行直接調(diào)用,傳入分詞之后的文件。我們也可以對(duì)其語(yǔ)料集建立詞向量,代碼參考這里;如果對(duì)于詞向量基本使用尚不熟悉的同學(xué)可以參考基于 Gensim 的 Word2Vec 實(shí)踐:
- def wv_train(self, tokenized_text_path, output_model_path='./wv_model.bin'):
- """
- 對(duì)于文本進(jìn)行詞向量訓(xùn)練,并將輸出的詞向量保存
- """
- sentences = word2vec.Text8Corpus(tokenized_text_path)
- # 進(jìn)行模型訓(xùn)練
- model = word2vec.Word2Vec(sentences, size=250)
- # 保存模型
- model.save(output_model_path)
- def wv_visualize(self, model_path, word=["中國(guó)", "航空"]):
- """
- 根據(jù)輸入的詞搜索鄰近詞然后可視化展示
- 參數(shù):
- model_path: Word2Vec 模型地址
- """
- # 加載模型
- model = word2vec.Word2Vec.load(model_path)
- # 尋找出最相似的多個(gè)詞
- words = [wp[0] for wp in model.most_similar(word, topn=20)]
- # 提取出詞對(duì)應(yīng)的詞向量
- wordsInVector = [model[word] for word in words]
- # 進(jìn)行 PCA 降維
- pca = PCA(n_components=2)
- pca.fit(wordsInVector)
- X = pca.transform(wordsInVector)
- # 繪制圖形
- xs = X[:, 0]
- ys = X[:, 1]
- plt.figure(figsize=(12, 8))
- plt.scatter(xs, ys, marker='o')
- # 遍歷所有的詞添加點(diǎn)注釋
- for i, w in enumerate(words):
- plt.annotate(
- w,
- xy=(xs[i], ys[i]), xytext=(6, 6),
- textcoords='offset points', ha='left', va='top',
- **dict(fontsize=10)
- )
- plt.show()
【本文是51CTO專(zhuān)欄作者“張梓雄 ”的原創(chuàng)文章,如需轉(zhuǎn)載請(qǐng)通過(guò)51CTO與作者聯(lián)系】