机器学习
1 监督式学习
分类算法:预测测试样本属于若干类别中的哪一个
回归算法:主要用于预测连续型变量
2 无监督式学习
3 半监督式学习
4 增强式学习
文本分类
词汇文档矩阵(term-document matrix)
BOW(bag of words,词袋模型)
文本清理、获取标签、向量化
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import sklearn
# Fetch the NLTK resources the script relies on: the sentence/word tokenizer
# models, the English stop-word list and the WordNet lemmatizer data.
# 'punkt_tab' is the tokenizer resource loaded by NLTK >= 3.8.2; 'punkt' is
# kept so that older NLTK releases continue to work.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)
def preprocessing(text):
    """Normalize raw English text into a space-separated string of lemmas.

    The text is split into sentences and each sentence into word tokens.
    Every token is lower-cased; English stop words and tokens shorter than
    three characters are dropped; the remaining tokens are lemmatized with
    the WordNet lemmatizer.

    Args:
        text: The raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives the filters).
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    stop = set(stopwords.words('english'))
    lmtzr = WordNetLemmatizer()
    tokens = []
    for sent in nltk.sent_tokenize(text):
        # Words must come from word_tokenize: calling sent_tokenize on a
        # single sentence just returns that sentence, so no word-level
        # tokens would ever be produced.
        for word in nltk.word_tokenize(sent):
            # Lower-case before the stop-word test; the stop list is lower
            # case, so "The" or "And" would otherwise slip through.
            word = word.lower()
            if len(word) >= 3 and word not in stop:
                tokens.append(lmtzr.lemmatize(word))
    return ' '.join(tokens)
# Load the labels and the cleaned message text.
# Each row of the tab-separated file '1' is read as: column 0 = label,
# column 1 = message text (run through preprocessing()).
sms_data = []
sms_labels = []
# The with-block guarantees the file is closed; newline='' is the mode the
# csv module documents for files handed to csv.reader.
with open('1', newline='') as smsdata:
    csv_reader = csv.reader(smsdata, delimiter='\t')
    try:
        for line in csv_reader:
            # Skip blank or truncated rows instead of aborting the whole load.
            if len(line) < 2:
                continue
            # Clean the text before touching either list so that labels and
            # messages can never get out of step with each other.
            cleaned = preprocessing(line[1])
            sms_labels.append(line[0])
            sms_data.append(cleaned)
    except csv.Error as err:
        # Keep the rows read so far, but report the problem rather than
        # swallowing it silently.
        print("Stopped reading SMS data at line", csv_reader.line_num, "-", err)
# Sampling: the first 70% of the messages form the training set and the
# remaining 30% form the test set.
import numpy as np
trainset_size = int(round(len(sms_data) * 0.70))
x_train = np.array(sms_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
# The test slice starts exactly where the training slice ends; starting at
# trainset_size + 1 would silently drop one sample from both sets.
x_test = np.array(sms_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])
print(x_train)
print(y_train)
# Bag-of-words vectorization with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer
sms_list = list(x_train)
# NOTE(review): the entries of x_train already went through preprocessing()
# when the file was loaded, so this second pass looks redundant - confirm
# before removing it.
sms_exp = [preprocessing(line) for line in sms_list]
vectorizer = CountVectorizer(min_df=1)
X_exp = vectorizer.fit_transform(sms_exp)
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back only on releases that predate it.
get_names = getattr(vectorizer, 'get_feature_names_out', None)
if get_names is None:
    get_names = vectorizer.get_feature_names
print("||".join(get_names()))
# Term counts of the first training message.
print(X_exp.toarray()[0])
# TF-IDF vectorization over unigrams and bigrams.
from sklearn.feature_extraction.text import TfidfVectorizer
# norm must be 'l2' (letter L, digit 2). The previous value '12' is not a
# valid norm, and the error it caused was hidden by a bare except, leaving
# X_train and X_test undefined.
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2')
# Learn the vocabulary and IDF weights from the training text only, then
# apply the same mapping to the test text. Errors are allowed to surface
# here instead of being swallowed.
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)
网友评论