自然语言处理：NLP数据清洗（re库、nltk库的使用）

原创已于 2024-01-27 21:42:53 修改 · 1.1k 阅读

7 ·

CC 4.0 BY-SA版权

文章标签：

#python #自然语言处理 #nlp #word2vec

于 2024-01-27 13:44:01 首次发布

本文介绍了在自然语言处理(NLP)和程序语言处理(PLP)项目中，数据清洗的关键步骤，特别是在英文文本处理中，如何使用nltk库进行分词、停用词筛选、词性标注和词形还原，以及如何清洗代码文本以提高数据质量和模型性能。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

文本（自然语言/程序语言）处理项目中，数据在进入模型之前往往要经过一些处理步骤，使得原始数据更加符合模型的需求、更有利于特征提取。具体来说，数据清洗主要目的是去除文本中的无用信息、修正文本中的错误、统一格式等，以提高数据质量和可用性、减小噪声。对于NLP/PLP来说，数据清洗是非常重要的步骤。

下面以代码数据和英文自然语言token数据的清洗为例，说说数据清洗的主要步骤：在英文的自然语言处理中，常用到nltk这个库。当需要对长文本进行序列化的时候，可以用它来分词；当需要筛出列表中几乎不包含语义信息的停用词（如the、in）时，可以调用它的停用词词库；进行序列识别的时候可以用它识别词性；制作词典的时候可以用它来进行词形还原或词干提取。nltk配合re库使用可以取得不错的效果。

以下面几个库的使用为例，nltk库的导入：

import re

from nltk import word_tokenize as wt     # tokenizer
from nltk.stem import WordNetLemmatizer  # lemmatizer
from nltk.corpus import stopwords        # stop-word corpus
from nltk.tag import pos_tag             # part-of-speech tagger
# One-time resource downloads (uncomment on first run):
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

代码文本的清洗：使用re库去注释

def clean_code(codes: str) -> str:
    """Strip comments, annotations and noise from a source-code string.

    The input is treated as Java-like source text. The steps run in order:

    1. Remove ``//`` line comments and everything from an ``@`` to the end
       of its line, on every line of the input.
    2. Remove ``/* ... */`` block comments, including multi-line ones.
    3. Remove leftover ``Nonnull`` / ``Nullable`` words and the single word
       that precedes an access modifier right after ``{``, or that precedes
       ``final`` / ``default``.
    4. Turn single quotes into double quotes and delete a fixed set of
       special characters.
    5. Collapse all whitespace runs to one space and remove the padding
       just inside parentheses and braces.

    The matching is purely textual: ``//`` or ``@`` inside a string literal
    is removed as well.

    Args:
        codes: Source text to clean; may span several lines.

    Returns:
        The cleaned text on a single line, without leading or trailing
        whitespace.
    """
    code = codes

    # re.MULTILINE is required so that `$` matches at the end of *every*
    # line; without it only a comment on the last line would be removed.
    code = re.sub(r'//.*$', '', code, flags=re.MULTILINE)
    code = re.sub(r'@.*$', '', code, flags=re.MULTILINE)

    # re.DOTALL lets `.` cross newlines so multi-line block comments match.
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

    # Remove leftover annotation names and stray words before modifiers.
    code = re.sub(r'Nonnull|Nullable', '', code)
    for modifier in ('public', 'private', 'protected'):
        code = re.sub(r'{(\w+)\s+' + modifier, '{' + modifier, code)
    for keyword in ('final', 'default'):
        code = re.sub(r'(\w+)\s+' + keyword, keyword, code)

    # Normalise quotes, then delete special characters.
    code = re.sub(r"'", '"', code)
    code = re.sub(r'[@#$&%~`\\……|]', '', code)

    # Collapse whitespace (including newlines) and tighten brackets.
    code = re.sub(r'\s+', ' ', code)
    code = re.sub(r'\(\s+', '(', code)
    code = re.sub(r'\s+\)', ')', code)
    code = re.sub(r'\{\s+', '{', code)
    code = re.sub(r'\s+}', '}', code)
    return code.strip()

token的清洗：拆分合成词、词形还原、删除停用词，目的是为了保证最后尽可能小的词典，涵盖整个token数据集中尽可能多的词，这样序列化之后有利于模型的学习

# 拆分驼峰命名或下划线连接的合成词
def camel_case_split_underscore(identifier: str) -> list:
    """Split a camelCase and/or underscore-joined identifier into words.

    The identifier is first cut into camel-case words with a regular
    expression: a lowercase run, a capital followed by lowercase letters,
    or a run of capitals (an acronym) that ends before the next capitalised
    word. Characters the pattern does not cover (digits, underscores,
    operators) act as separators and are dropped. If the pattern finds
    nothing at all, the identifier itself is kept. Every piece is then split
    on ``_`` and lowercased; empty pieces are discarded.

    Args:
        identifier: The token to split.

    Returns:
        A list of lowercase word strings; empty when the identifier
        consists only of underscores or is empty.
    """
    # The negative lookahead ends an uppercase run wherever the next
    # character is not a lowercase letter, so "MAX_VALUE" yields MAX / VALUE
    # and "HTTPServer" yields HTTP / Server without losing characters.
    words = re.findall(r'[a-z]+|[A-Z](?:[a-z]+|[A-Z]*(?![a-z]))', identifier)
    pieces = words if words else [identifier]

    components = []
    for piece in pieces:
        # Only the fallback piece can still contain underscores.
        for part in piece.split('_'):
            if part:
                components.append(part.lower())
    return components

# 清洗token 
def clean_token(tokens_list):
    """Clean a sequence of tokens for vocabulary building.

    Each token is processed once, in order:

    1. Tokens containing characters outside letters, digits, ``_`` and the
       operator characters ``-+=%^|&*!`` are discarded.
    2. Single alphanumeric characters are discarded.
    3. The token is split into words by ``camel_case_split_underscore``.
    4. Each word is POS-tagged and lemmatized according to its tag
       (noun, adjective, verb, adverb); other tags are kept unchanged.
    5. English stop words and single alphanumeric characters are removed,
       and ``===`` / ``!==`` are normalised to ``==`` / ``!=``.

    Args:
        tokens_list: Iterable of token strings (each element is matched
            with a regular expression, so it must be a ``str``).

    Returns:
        A flat list of cleaned, lemmatized word strings.
    """
    # A set gives O(1) membership tests in the stop-word filter below.
    stop_words = set(stopwords.words('english'))
    wnl = WordNetLemmatizer()
    # Penn Treebank tag prefix -> WordNet part-of-speech code.
    pos_by_prefix = (('NN', 'n'), ('JJ', 'a'), ('VB', 'v'), ('RB', 'r'))

    aft_lem = []
    for token in tokens_list:
        # Drop tokens containing characters outside the allowed set.
        if not re.match(r'^[a-zA-Z_0-9-+=%^|&*!]+$', token):
            continue
        # Drop single letters/digits.
        if len(token) <= 1 and token.isalnum():
            continue

        temp_tokens = camel_case_split_underscore(token)
        if not temp_tokens:
            continue

        # Lemmatize each word using the POS the tagger assigned to it.
        for word, tag in pos_tag(temp_tokens):
            for prefix, wordnet_pos in pos_by_prefix:
                if tag.startswith(prefix):
                    aft_lem.append(wnl.lemmatize(word, wordnet_pos))
                    break
            else:
                aft_lem.append(word)

    output_token_list = []
    for aft in aft_lem:
        if aft in stop_words:
            continue
        if len(aft) <= 1 and aft.isalnum():
            continue
        output_token_list.append(aft.replace('===', '==').replace('!==', '!='))
    return output_token_list