python字符串，脚本，文件操作，爬虫-CSDN博客

>>> x = "测试"
>>> x.encode("utf-8")  # 对数据进行编码（encode）
b'\xe6\xb5\x8b\xe8\xaf\x95'
>>> y = b'\xe6\xb5\x8b\xe8\xaf\x95'
>>> y.decode("utf-8")  # 对数据进行解码（decode），遇到乱码时用 decode
'测试'

>>> a = "yk-001"
>>> a.startswith("yk")  # 以xx开头
True
>>> b = "xxxxx@163.com"
>>> b.endswith("163.c0m")
False
>>> b.endswith("163.com")  # 以xx结尾
True

模糊搜索文件

在 C:\python_3_10_9 目录下寻找不是exe类型并且文件名里有python的文件

import os
# Directory to scan, given as an absolute path. Forward slashes are used
# instead of backslashes so nothing in the Windows path is read as an escape.
files = "C:/python_3_10_9"
path = os.listdir(files)

# Keep every entry whose name contains "python" and is not an .exe file.
matches = [
    name
    for name in path
    if "python" in name and not name.endswith(".exe")
]

for name in matches:
    print(name)

文件自动分类

import os
import shutil

# Sort every regular file in `path` into a sub-folder named after its
# extension, e.g. "notes.txt" -> "./txt/notes.txt".
path = "./"

for name in os.listdir(path):
    source = os.path.join(path, name)

    # Only regular files are sorted. Sub-folders (including the per-extension
    # folders created by this loop) stay where they are.
    if not os.path.isfile(source):
        continue

    # os.path.splitext("notes.txt") returns ("notes", ".txt"); the leading dot
    # is stripped so the folder is called "txt". Names without an extension
    # (and dotfiles such as ".gitignore") give "" and are skipped, because the
    # target folder would otherwise be named after the file itself.
    extension = os.path.splitext(name)[1].lstrip(".")
    if not extension:
        continue

    folder_name = os.path.join(path, extension)
    # exist_ok=True creates the folder on first use and is a no-op afterwards,
    # so no separate existence check is needed.
    os.makedirs(folder_name, exist_ok=True)
    shutil.move(source, folder_name)

实现文件归类
使用Python进行这样的操作:
1.把jpg,png,gif 文件夹中的所有文件移动到image文件夹中，然后删除jpg,png,gif 文件夹
2.把 doc,md,ppt 文件夹中的所有文件移动到document文件夹中，然后删除 doc,md,ppt 文件夹

import os
import shutil
# Merge the per-extension folders under `path` into two category folders and
# remove the emptied per-extension folders afterwards.
path = "E:/pycharmcodes/pythonProject1/doc"

# Category folder -> the per-extension folders whose files are moved into it.
groups = {
    "image": ["jpg", "png", "gif"],
    "document": ["doc", "md", "ppt"],
}

for target_name, source_names in groups.items():
    target = path + "/" + target_name
    # exist_ok=True lets the script be run again without crashing on the
    # already-created category folder.
    os.makedirs(target, exist_ok=True)

    for source_name in source_names:
        source = path + "/" + source_name
        # A missing per-extension folder simply has nothing to contribute.
        if not os.path.isdir(source):
            continue

        for f in os.listdir(source):
            shutil.move(source + "/" + f, target)

        # os.rmdir removes only this (now empty) folder; os.removedirs would
        # also try to prune empty parent directories.
        os.rmdir(source)

文件操作

# Write: mode "w" opens the file if it exists (truncating it) and creates it
# otherwise. The with-statement closes the file even if write() raises.
with open("test.txt", "w", encoding="utf-8") as f:
    f.write("xxxxxxxxxxx")

# Read: the three calls below all advance the same file position, so the
# position is reset with seek(0) between them; without that, a read that
# reaches the end of the file leaves nothing for the next call.
with open("test.txt", "r", encoding="utf-8") as f:
    # read(5) returns the next five characters; each call continues where the
    # previous one stopped.
    content = f.read(5)
    print(content)

    f.seek(0)
    # readline() returns one line; each call moves on to the next line.
    content = f.readline()
    print(content)

    f.seek(0)
    # readlines() returns all remaining lines as a list.
    content = f.readlines()
    print(content)

爬虫

import requests
if __name__ == "__main__":
    # 1. Target URL: the Sogou home page.
    url = 'https://www.sogou.com/'

    # 2. Send the request. requests.get() returns a Response object.
    #    requests has no default timeout, so one is set explicitly to keep the
    #    script from hanging on a stalled connection.
    response = requests.get(url=url, timeout=10)
    # Fail loudly on 4xx/5xx instead of saving an error page.
    response.raise_for_status()

    # 3. Read the response data; .text is the body decoded to a str.
    page_text = response.text
    print(page_text)

    # 4. Persist the page to disk.
    with open('./sogoutest.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取结束')

UA:User-Agent(请求载体的身份标识)
UA伪装:门户网站的服务器会检测发送请求的载体身份标识。如果检测到请求的载体身份标识为某款浏览器，说明该请求是正常的请求；如果检测到请求的载体身份标识不是基于某款浏览器，则表示该请求为不正常的请求(爬虫)，服务器端就很有可能拒绝该次请求。

搜狗自定义搜索内容

import requests
if __name__ == "__main__":
    # UA spoofing: send a browser User-Agent so the request is not identified
    # as coming from a script.
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
    }
    url = 'https://www.sogou.com/web'

    # The query-string parameters go in a dict; requests URL-encodes them.
    kw = input('输入想搜索的数据：')
    param = {
        'query':kw
    }

    # requests has no default timeout, so one is set explicitly to keep the
    # script from hanging on a stalled connection.
    response = requests.get(url=url, params=param, headers=header, timeout=10)
    # Fail loudly on 4xx/5xx instead of saving an error page.
    response.raise_for_status()
    html_text = response.text

    # Save the result page as "<keyword>.html" in the current directory.
    fileName = kw+'.html'
    with open(fileName,'w',encoding='utf-8') as test:
        test.write(html_text)
    print(kw+'搜索保存结束')

百度翻译内容保存到本地

可以在调试工具---Network---Response Headers---Content-Type中看到返回的是什么类型的数据

import requests
import json
if __name__ == "__main__":
    post_url = 'https://fanyi.baidu.com/sug'
    # UA spoofing: send a browser User-Agent with the request.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0'
    }
    word = input('输入：')
    # Form fields sent in the POST body.
    data = {
        'kw':word
    }

    # requests has no default timeout, so one is set explicitly to keep the
    # script from hanging on a stalled connection.
    response = requests.post(url=post_url, data=data, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()
    # The response body is JSON, so .json() parses it into Python objects.
    obj = response.json()

    # Save the parsed result as "<word>.json". The with-statement closes (and
    # therefore flushes) the file; ensure_ascii=False keeps non-ASCII text
    # readable instead of \uXXXX escapes.
    fileName = word+'.json'
    with open(fileName,'w',encoding='utf-8') as fp:
        json.dump(obj,fp=fp,ensure_ascii=False)
    print('OK')