import requests
import re
import wget
import os
from pathlib import Path

# Request headers: reuse an already logged-in admin session by sending its cookie.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    'Cookie': 'PHPSESSID=69fk4bobxxxxxxxxbjm20; XXZ_ZLK_admin_username=username',
}

url = 'https://www.xxxxx.cn/admin.html'
session = requests.Session()
list_urls = []
path = 'E:\\xx要的全部测试'  # local directory the downloaded files are saved to

# Step 1: walk the paginated list pages and collect every <a> tag whose link points at a file.
for page in range(7, 8):
    print('************ Crawling page {} ************'.format(page))
    Target_url = 'https://www.xxxxx.cn/admin/means/index.html?page={}'.format(page)
    response = session.get(Target_url, headers=headers)
    print('Page URL: ' + str(Target_url) + ', status code: ' + str(response.status_code))

    content = response.text
    urls = re.findall(r"<a.*?href=.*?</a>", content, re.I | re.S | re.M)
    keywords = "https://yyy.xxxxx.cn/means/file/"
    r_url = 0
    for url in urls:
        if keywords in url:
            r_url += 1
            print('Collecting link {}'.format(r_url))
            list_urls.append(url)

# Step 2: pull the href and the link text out of each collected tag, download the file,
# then rename it to the link text plus the original file extension.
n_urls = 0
for i in list_urls:
    file_url_res = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
    file_url = re.findall(file_url_res, i, re.I | re.S | re.M)
    file_name_res = r"(?<=>).*?(?=<)"
    file_name = re.findall(file_name_res, i, re.I | re.S | re.M)
    n_urls += 1
    print("Downloading item " + str(n_urls))
    filename = wget.download(file_url[0], path)
    filename_suffix = os.path.splitext(filename)[-1]
    try:
        os.rename(filename, os.path.join(path, file_name[0] + filename_suffix))
    except OSError:
        # The rename can fail if the link text contains characters Windows rejects
        # or a file with that name already exists; in that case keep the wget name.
        pass
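One detail worth noting: the rename step uses the raw link text as the new file name, and on Windows that text can contain characters the filesystem rejects (such as `/`, `:` or `?`), which is what the `except OSError` branch quietly absorbs. Below is a minimal sketch of a sanitizing helper; `safe_name` is a hypothetical function added here for illustration, not part of the original script.

import re

def safe_name(name, fallback='download'):
    # Replace characters Windows forbids in file names and trim surrounding whitespace.
    cleaned = re.sub(r'[\\/:*?"<>|]', '_', name).strip()
    return cleaned or fallback

# Possible usage at the rename step:
# os.rename(filename, os.path.join(path, safe_name(file_name[0]) + filename_suffix))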