### Python String Processing
#### Stripping whitespace and special characters
input_str = '   今天天气不错,今天挺风和日丽的   '  # padded with spaces so there is something to strip
input_str.strip()   # remove whitespace from both ends
input_str.rstrip()  # remove whitespace from the right end only
input_str.lstrip()  # remove whitespace from the left end only
input_str = 'AAA今天天气不错,挺风和日丽的AAA'
input_str.strip('A')   # strip the given characters instead of whitespace
input_str.lstrip('A')
input_str.rstrip('A')
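# A quick added sketch of the return values -- strip()/lstrip()/rstrip() return
# new strings and never modify the original:
demo = '  hello  '
print(repr(demo.strip()))     # 'hello'
print(repr(demo.lstrip()))    # 'hello  '
print(repr(demo.rstrip()))    # '  hello'
print('AAAhiAAA'.strip('A'))  # 'hi'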
#### Replacement
input_str.replace('今天', '昨天')
input_str.replace('今天', '')  # replacing with '' simply deletes the substring
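# An added note: replace() also takes an optional count, and the original
# string is left untouched (Python strings are immutable):
demo = '今天今天今天'
print(demo.replace('今天', '昨天', 2))  # '昨天昨天今天' -- only the first two occurrences
print(demo)                             # '今天今天今天' -- unchanged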
#### Searching
input_str.find('今天')  # index of the first occurrence
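# An added contrast: find() returns the index of the first match, or -1 when
# the substring is absent; index() raises ValueError instead.
print(input_str.find('风和日丽'))  # an index >= 0
print(input_str.find('下雨'))      # -1, not an error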
#### Content tests
input_str = '123'
input_str.isalpha()  # False -- the characters are digits, not letters
input_str.isdigit()  # True
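# Added examples of the boundary cases -- both tests are Unicode-aware:
print('abc'.isalpha())     # True
print('abc123'.isalpha())  # False -- mixed letters and digits
print('今天'.isalpha())    # True -- CJK characters count as alphabetic
print('1.5'.isdigit())     # False -- the decimal point is not a digit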
#### Splitting and joining
input_str = '今天 天气 不错,今天 挺 风和日丽 的'
input_str = input_str.split(' ')  # split on spaces into a list of words
input_str
''.join(input_str)  # concatenate the pieces back together without a separator
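# Added for contrast: join() accepts any separator string, so a split can be
# reversed by joining with the same delimiter:
print(' '.join(input_str))        # restores the original spaced sentence
print('/'.join(['a', 'b', 'c']))  # 'a/b/c'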
#### Help documentation
help(str)
### Installing the NLTK toolkit
# A very practical text-processing toolkit with a long history, mainly aimed at English data
import nltk  # pip install nltk
nltk.download()

#### Tokenization
from nltk.tokenize import word_tokenize
from nltk.text import Text
input_str = "Today's weather is good, very windy and sunny, we have no classes in the afternoon,We have to play basketball tomorrow."
tokens = word_tokenize(input_str)
tokens = [word.lower() for word in tokens]
tokens[:5]

#### The Text object
help(nltk.text)
# Create a Text object to make the follow-up operations convenient
t = Text(tokens)
t.count('good')
t.index('good')
t.plot(8)

#### Stopwords
# The corpus README gives a short description of what is included
from nltk.corpus import stopwords
stopwords.readme().replace('\n', ' ')
stopwords.fileids()
stopwords.raw('english').replace('\n', ' ')
test_words = [word.lower() for word in tokens]
test_words_set = set(test_words)
test_words_set.intersection(set(stopwords.words('english')))

#### Filtering out stopwords
filtered = [w for w in test_words_set if w not in stopwords.words('english')]
filtered

#### Part-of-speech tagging
nltk.download()  # the third item in the downloader list (the POS tagger, averaged_perceptron_tagger)
from nltk import pos_tag
tags = pos_tag(tokens)
tags

#### Chunking
from nltk.chunk import RegexpParser
sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('died', 'VBD')]
grammar = "MY_NP: {<DT>?<JJ>*<NN>}"
cp = RegexpParser(grammar)   # build the chunking rule
result = cp.parse(sentence)  # run the chunker
print(result)
result.draw()  # draws the chunk tree in a pop-up window

#### Named entity recognition
nltk.download()  # download maxent_ne_chunker and words
from nltk import ne_chunk
sentence = "Edison went to Tsinghua University today."
print(ne_chunk(pos_tag(word_tokenize(sentence))))
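# An added sketch (assuming the NLTK resources above are installed): ne_chunk
# returns an nltk.Tree, so the recognized entities can be collected by walking
# its labeled subtrees.
tree = ne_chunk(pos_tag(word_tokenize("Edison went to Tsinghua University today.")))
entities = [(' '.join(tok for tok, tag in subtree.leaves()), subtree.label())
            for subtree in tree.subtrees() if subtree.label() != 'S']  # skip the root node
print(entities)  # e.g. [('Edison', 'PERSON'), ('Tsinghua University', 'ORGANIZATION')]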
#### Data-cleaning example
import re
from nltk.corpus import stopwords

# Input data (the &amp; entity is restored here so the entity-stripping regex below has something to match)
s = ' RT @Amila #Test\nTom\'s newly listed Co &amp; Mary\'s unlisted Group to supply tech for nlTK.\nh $TSLA $AAPL https:// t.co/x34afsfQsh'
# Cache the English stopword list
cache_english_stopwords = stopwords.words('english')

def text_clean(text):
    print('Raw data:', text, '\n')
    # Remove HTML entities (e.g. &amp;), hashtags and @mentions
    text_no_special_entities = re.sub(r'\&\w*;|#\w*|@\w*', '', text)
    print('After removing special tags:', text_no_special_entities, '\n')
    # Remove ticker symbols such as $TSLA
    text_no_tickers = re.sub(r'\$\w*', '', text_no_special_entities)
    print('After removing ticker symbols:', text_no_tickers, '\n')
    # Remove hyperlinks
    text_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', text_no_tickers)
    print('After removing hyperlinks:', text_no_hyperlinks, '\n')
    # Remove abbreviations -- in practice, any word of only one or two characters
    text_no_small_words = re.sub(r'\b\w{1,2}\b', '', text_no_hyperlinks)
    print('After removing short words:', text_no_small_words, '\n')
    # Collapse redundant whitespace
    text_no_whitespace = re.sub(r'\s\s+', ' ', text_no_small_words)
    text_no_whitespace = text_no_whitespace.lstrip(' ')
    print('After removing whitespace:', text_no_whitespace, '\n')
    # Tokenize
    tokens = word_tokenize(text_no_whitespace)
    print('Tokenization result:', tokens, '\n')
    # Remove stopwords
    list_no_stopwords = [i for i in tokens if i not in cache_english_stopwords]
    print('After removing stopwords:', list_no_stopwords, '\n')
    # Final filtered result
    text_filtered = ' '.join(list_no_stopwords)  # ''.join() would concatenate the words without spaces
    print('Filtered:', text_filtered)

text_clean(s)
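# A compact variant of the same pipeline (a hypothetical helper, added here)
# that returns the cleaned text instead of printing every intermediate step:
def clean_tweet(text):
    text = re.sub(r'\&\w*;|#\w*|@\w*', '', text)    # entities, hashtags, mentions
    text = re.sub(r'\$\w*', '', text)               # ticker symbols
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)  # hyperlinks
    text = re.sub(r'\b\w{1,2}\b', '', text)         # 1-2 character words
    text = re.sub(r'\s\s+', ' ', text).lstrip(' ')  # extra whitespace
    return ' '.join(w for w in word_tokenize(text) if w not in cache_english_stopwords)

print(clean_tweet(s))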
### [spaCy](http://spacy.io/docs/#examples) introduction
# Import the toolkit and the English model
# python -m spacy download en_core_web_sm  (on Windows, run this in a CMD opened as administrator)
import spacy
nlp = spacy.load('en_core_web_sm')

#### Text processing
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
# Tokenization
for token in doc:
    print(token)
# Sentence segmentation
for sent in doc.sents:
    print(sent)

#### Part-of-speech tags
for token in doc:
    print('{}-{}'.format(token, token.pos_))

#### Named entity recognition
doc_2 = nlp("I went to Paris where I met my old friend Jack from uni.")
for ent in doc_2.ents:
    print('{}-{}'.format(ent, ent.label_))

from spacy import displacy
doc = nlp('I went to Paris where I met my old friend Jack from uni.')
displacy.render(doc, style='ent', jupyter=True)

#### Finding every character name in a book
def read_file(file_name):
    with open(file_name, 'r') as file:
        return file.read()

# Load the text data
text = read_file('./data/pride_and_prejudice.txt')
processed_text = nlp(text)
sentences = [s for s in processed_text.sents]
print(len(sentences))
sentences[:5]

from collections import Counter, defaultdict

def find_person(doc):
    c = Counter()
    for ent in doc.ents:  # iterate over the document passed in, not the global
        if ent.label_ == 'PERSON':
            c[ent.lemma_] += 1
    return c.most_common(10)

print(find_person(processed_text))

#### Terrorist-attack analysis
def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()

terrorism_articles = read_file_to_list('data/rand-terrorism-dataset.txt')
terrorism_articles[:5]
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]

common_terrorist_groups = [
    'taliban', 'al - qaeda', 'hamas', 'fatah', 'plo', 'bilad al - rafidayn'
]
common_locations = [
    'iraq', 'baghdad', 'kirkuk', 'mosul', 'afghanistan', 'kabul',
    'basra', 'palestine', 'gaza', 'israel', 'istanbul', 'beirut', 'pakistan'
]

location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    # People or organizations
    article_terrorist_groups = [ent.lemma_ for ent in article.ents
                                if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
    # Geopolitical entities (locations)
    article_locations = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE']
    terrorist_common = [ent for ent in article_terrorist_groups if ent in common_terrorist_groups]
    locations_common = [ent for ent in article_locations if ent in common_locations]
    for found_entity in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1

location_entity_dict

import pandas as pd
location_entity_df = pd.DataFrame.from_dict(dict(location_entity_dict), dtype=int)
location_entity_df = location_entity_df.fillna(value=0).astype(int)
location_entity_df

import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 10))
hmap = sns.heatmap(location_entity_df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
# Add the title and tick labels
plt.title('Global Incidents by Terrorist group')
plt.xticks(rotation=30)
plt.show()
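# An added follow-up using only the DataFrame built above: summing the columns
# ranks groups by total incidents, summing the rows ranks locations.
print(location_entity_df.sum(axis=0).sort_values(ascending=False))  # incidents per group
print(location_entity_df.sum(axis=1).sort_values(ascending=False))  # incidents per location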