### Python String Processing
#### Stripping whitespace and special characters
input_str = '   今天天气不错,今天挺风和日丽的   '  # padded with spaces so there is something to strip
input_str.strip()   # remove whitespace from both ends
input_str.rstrip()  # remove whitespace from the right end only
input_str.lstrip()  # remove whitespace from the left end only
input_str = 'AAA今天天气不错,挺风和日丽的AAA'
input_str.strip('A')   # strip the given characters instead of whitespace
input_str.lstrip('A')
input_str.rstrip('A')
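# A quick added sketch of the return values -- strip()/lstrip()/rstrip() return
# new strings and never modify the original:
demo = '  hello  '
print(repr(demo.strip()))     # 'hello'
print(repr(demo.lstrip()))    # 'hello  '
print(repr(demo.rstrip()))    # '  hello'
print('AAAhiAAA'.strip('A'))  # 'hi'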
#### Replacement
input_str.replace('今天', '昨天')
input_str.replace('今天', '')  # replacing with '' simply deletes the substring
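# An added note: replace() also takes an optional count, and the original
# string is left untouched (Python strings are immutable):
demo = '今天今天今天'
print(demo.replace('今天', '昨天', 2))  # '昨天昨天今天' -- only the first two occurrences
print(demo)                             # '今天今天今天' -- unchanged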
#### Searching
input_str.find('今天')  # index of the first occurrence
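# An added contrast: find() returns the index of the first match, or -1 when
# the substring is absent; index() raises ValueError instead.
print(input_str.find('风和日丽'))  # an index >= 0
print(input_str.find('下雨'))      # -1, not an error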
#### Content tests
input_str = '123'
input_str.isalpha()  # False -- the characters are digits, not letters
input_str.isdigit()  # True
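# Added examples of the boundary cases -- both tests are Unicode-aware:
print('abc'.isalpha())     # True
print('abc123'.isalpha())  # False -- mixed letters and digits
print('今天'.isalpha())    # True -- CJK characters count as alphabetic
print('1.5'.isdigit())     # False -- the decimal point is not a digit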
#### Splitting and joining
input_str = '今天 天气 不错,今天 挺 风和日丽 的'
input_str = input_str.split(' ')  # split on spaces into a list of words
input_str
''.join(input_str)  # concatenate the pieces back together without a separator
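# Added for contrast: join() accepts any separator string, so a split can be
# reversed by joining with the same delimiter:
print(' '.join(input_str))        # restores the original spaced sentence
print('/'.join(['a', 'b', 'c']))  # 'a/b/c'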
#### Help documentation
help(str)
### Installing the NLTK toolkit
# A very practical text-processing toolkit with a long history, mainly aimed at English data
import nltk  # pip install nltk
nltk.download()

#### Tokenization
from nltk.tokenize import word_tokenize
from nltk.text import Text
input_str = "Today's weather is good, very windy and sunny, we have no classes in the afternoon,We have to play basketball tomorrow."
tokens = word_tokenize(input_str)
tokens = [word.lower() for word in tokens]
tokens[:5]

#### The Text object
help(nltk.text)
# Create a Text object to make the follow-up operations convenient
t = Text(tokens)
t.count('good')
t.index('good')
t.plot(8)

#### Stopwords
# The corpus README gives a short description of what is included
from nltk.corpus import stopwords
stopwords.readme().replace('\n', ' ')
stopwords.fileids()
stopwords.raw('english').replace('\n', ' ')
test_words = [word.lower() for word in tokens]
test_words_set = set(test_words)
test_words_set.intersection(set(stopwords.words('english')))

#### Filtering out stopwords
filtered = [w for w in test_words_set if w not in stopwords.words('english')]
filtered

#### Part-of-speech tagging
nltk.download()  # the third item in the downloader list (the POS tagger, averaged_perceptron_tagger)
from nltk import pos_tag
tags = pos_tag(tokens)
tags

#### Chunking
from nltk.chunk import RegexpParser
sentence = [('the', 'DT'), ('little', 'JJ'), ('yellow', 'JJ'), ('dog', 'NN'), ('died', 'VBD')]
grammar = "MY_NP: {<DT>?<JJ>*<NN>}"
cp = RegexpParser(grammar)   # build the chunking rule
result = cp.parse(sentence)  # run the chunker
print(result)
result.draw()  # draws the chunk tree in a pop-up window

#### Named entity recognition
nltk.download()  # download maxent_ne_chunker and words
from nltk import ne_chunk
sentence = "Edison went to Tsinghua University today."
print(ne_chunk(pos_tag(word_tokenize(sentence))))
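# An added sketch (assuming the NLTK resources above are installed): ne_chunk
# returns an nltk.Tree, so the recognized entities can be collected by walking
# its labeled subtrees.
tree = ne_chunk(pos_tag(word_tokenize("Edison went to Tsinghua University today.")))
entities = [(' '.join(tok for tok, tag in subtree.leaves()), subtree.label())
            for subtree in tree.subtrees() if subtree.label() != 'S']  # skip the root node
print(entities)  # e.g. [('Edison', 'PERSON'), ('Tsinghua University', 'ORGANIZATION')]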
#### Data-cleaning example
import re
from nltk.corpus import stopwords

# Input data (the &amp; entity is restored here so the entity-stripping regex below has something to match)
s = ' RT @Amila #Test\nTom\'s newly listed Co &amp; Mary\'s unlisted Group to supply tech for nlTK.\nh $TSLA $AAPL https:// t.co/x34afsfQsh'
# Cache the English stopword list
cache_english_stopwords = stopwords.words('english')

def text_clean(text):
    print('Raw data:', text, '\n')
    # Remove HTML entities (e.g. &amp;), hashtags and @mentions
    text_no_special_entities = re.sub(r'\&\w*;|#\w*|@\w*', '', text)
    print('After removing special tags:', text_no_special_entities, '\n')
    # Remove ticker symbols such as $TSLA
    text_no_tickers = re.sub(r'\$\w*', '', text_no_special_entities)
    print('After removing ticker symbols:', text_no_tickers, '\n')
    # Remove hyperlinks
    text_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', text_no_tickers)
    print('After removing hyperlinks:', text_no_hyperlinks, '\n')
    # Remove abbreviations -- in practice, any word of only one or two characters
    text_no_small_words = re.sub(r'\b\w{1,2}\b', '', text_no_hyperlinks)
    print('After removing short words:', text_no_small_words, '\n')
    # Collapse redundant whitespace
    text_no_whitespace = re.sub(r'\s\s+', ' ', text_no_small_words)
    text_no_whitespace = text_no_whitespace.lstrip(' ')
    print('After removing whitespace:', text_no_whitespace, '\n')
    # Tokenize
    tokens = word_tokenize(text_no_whitespace)
    print('Tokenization result:', tokens, '\n')
    # Remove stopwords
    list_no_stopwords = [i for i in tokens if i not in cache_english_stopwords]
    print('After removing stopwords:', list_no_stopwords, '\n')
    # Final filtered result
    text_filtered = ' '.join(list_no_stopwords)  # ''.join() would concatenate the words without spaces
    print('Filtered:', text_filtered)

text_clean(s)
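# A compact variant of the same pipeline (a hypothetical helper, added here)
# that returns the cleaned text instead of printing every intermediate step:
def clean_tweet(text):
    text = re.sub(r'\&\w*;|#\w*|@\w*', '', text)    # entities, hashtags, mentions
    text = re.sub(r'\$\w*', '', text)               # ticker symbols
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)  # hyperlinks
    text = re.sub(r'\b\w{1,2}\b', '', text)         # 1-2 character words
    text = re.sub(r'\s\s+', ' ', text).lstrip(' ')  # extra whitespace
    return ' '.join(w for w in word_tokenize(text) if w not in cache_english_stopwords)

print(clean_tweet(s))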
### [spaCy](http://spacy.io/docs/#examples) introduction
# Import the toolkit and the English model
# python -m spacy download en_core_web_sm  (on Windows, run this in a CMD opened as administrator)
import spacy
nlp = spacy.load('en_core_web_sm')

#### Text processing
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
# Tokenization
for token in doc:
    print(token)
# Sentence segmentation
for sent in doc.sents:
    print(sent)

#### Part-of-speech tags
for token in doc:
    print('{}-{}'.format(token, token.pos_))

#### Named entity recognition
doc_2 = nlp("I went to Paris where I met my old friend Jack from uni.")
for ent in doc_2.ents:
    print('{}-{}'.format(ent, ent.label_))

from spacy import displacy
doc = nlp('I went to Paris where I met my old friend Jack from uni.')
displacy.render(doc, style='ent', jupyter=True)

#### Finding every character name in a book
def read_file(file_name):
    with open(file_name, 'r') as file:
        return file.read()

# Load the text data
text = read_file('./data/pride_and_prejudice.txt')
processed_text = nlp(text)
sentences = [s for s in processed_text.sents]
print(len(sentences))
sentences[:5]

from collections import Counter, defaultdict

def find_person(doc):
    c = Counter()
    for ent in doc.ents:  # iterate over the document passed in, not the global
        if ent.label_ == 'PERSON':
            c[ent.lemma_] += 1
    return c.most_common(10)

print(find_person(processed_text))

#### Terrorist-attack analysis
def read_file_to_list(file_name):
    with open(file_name, 'r') as file:
        return file.readlines()

terrorism_articles = read_file_to_list('data/rand-terrorism-dataset.txt')
terrorism_articles[:5]
terrorism_articles_nlp = [nlp(art) for art in terrorism_articles]

common_terrorist_groups = [
    'taliban', 'al - qaeda', 'hamas', 'fatah', 'plo', 'bilad al - rafidayn'
]
common_locations = [
    'iraq', 'baghdad', 'kirkuk', 'mosul', 'afghanistan', 'kabul',
    'basra', 'palestine', 'gaza', 'israel', 'istanbul', 'beirut', 'pakistan'
]

location_entity_dict = defaultdict(Counter)
for article in terrorism_articles_nlp:
    # People or organizations
    article_terrorist_groups = [ent.lemma_ for ent in article.ents
                                if ent.label_ == 'PERSON' or ent.label_ == 'ORG']
    # Geopolitical entities (locations)
    article_locations = [ent.lemma_ for ent in article.ents if ent.label_ == 'GPE']
    terrorist_common = [ent for ent in article_terrorist_groups if ent in common_terrorist_groups]
    locations_common = [ent for ent in article_locations if ent in common_locations]
    for found_entity in terrorist_common:
        for found_location in locations_common:
            location_entity_dict[found_entity][found_location] += 1

location_entity_dict

import pandas as pd
location_entity_df = pd.DataFrame.from_dict(dict(location_entity_dict), dtype=int)
location_entity_df = location_entity_df.fillna(value=0).astype(int)
location_entity_df

import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12, 10))
hmap = sns.heatmap(location_entity_df, annot=True, fmt='d', cmap='YlGnBu', cbar=False)
# Add the title and tick labels
plt.title('Global Incidents by Terrorist group')
plt.xticks(rotation=30)
plt.show()
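# An added follow-up using only the DataFrame built above: summing the columns
# ranks groups by total incidents, summing the rows ranks locations.
print(location_entity_df.sum(axis=0).sort_values(ascending=False))  # incidents per group
print(location_entity_df.sum(axis=1).sort_values(ascending=False))  # incidents per location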