Introduction
In today's digital age, the amount of information on the web is enormous, and sometimes we need to collect data from many websites at once. In this post, we will look at how to write a web crawler in Python that automatically scrapes data from multiple sites and saves the results to a file, using an example program called SearxCheckList to walk through the process.
Step 1: Preparation
Before writing the crawler, some preparation is needed. First, make sure you have Python and the required libraries installed. We will use the requests library to send HTTP requests, the built-in re module for regular expressions, and pysocks in case you want to route requests through a proxy server. You can install the third-party packages with pip:
pip install requests
pip install pysocks
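The pysocks package is only needed if you want to send the requests through a SOCKS proxy; the example program imports it but runs fine without a proxy. As a minimal sketch of how that would look (assuming a local SOCKS5 proxy on 127.0.0.1:1080, which is purely an assumption for illustration), you can pass a proxies dictionary to requests:
import requests

# Hypothetical local SOCKS5 proxy; adjust the address and port to your own setup
proxies = {
    "http": "socks5h://127.0.0.1:1080",
    "https": "socks5h://127.0.0.1:1080",
}

# requests relies on pysocks when it is given a socks5:// or socks5h:// proxy URL
response = requests.get("https://example.org", proxies=proxies, timeout=10)
print(response.status_code)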
Step 2: Writing the crawler
Next, we will write a Python program that performs the crawling task. Below is the main part of the example program:
This version is the recommended one: it uses a live list of 1000+ SearX nodes and works very well. Please do not use it for illegal purposes; it is for learning only.
import re
import socks
import requests
import urllib.parse
import json
import sys
import time
print('''
__ __ _____ ____ ____ _
| \/ |_ _| ___| _ \ / ___| / \ ___ _ __
| |\/| | | | | |_ | |_) | | _ / _ \ / __| '_ \
| | | | |_| | _| | __/| |_| |/ ___ \ | (__| | | |
|_| |_|\__, |_| |_| \____/_/ \_(_)___|_| |_|
|___/
SearX search tool {1.7.1#main}
''')
# Replace the placeholder query 'test' with '<param1>_<param2>' and append the page number
def modify_url(base_url, param1, param2, page):
    modified_url = base_url.replace('test', f'{param1}_{param2}') + f'&pageno={page}'
    return modified_url

# Fetch one result page and extract the result links; timeout defaults to 10 seconds
def fetch_retVal(url, headers, timeout=10):
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            page_content = response.text
            regex = r'<a href="([^"]+)" class="url_wrapper"'
            extracted_urls = [urllib.parse.unquote(match.group(1).replace("&amp;", "&")) for match in re.finditer(regex, page_content, re.I | re.S)]
            if len(extracted_urls) < 2:
                return None
            else:
                return extracted_urls
    except requests.exceptions.RequestException as e:
        print(f'ERROR: request failed - {str(e)}')
        return None
# Request headers
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Dnt": "1",
"Pragma": "no-cache",
"Sec-Ch-Ua": '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
# Read the command-line arguments
if len(sys.argv) == 2:
    param1 = sys.argv[1]
    param2 = ""  # If only one argument is given, leave param2 empty
elif len(sys.argv) == 3:
    param1 = sys.argv[1]
    param2 = sys.argv[2]
else:
    print("Usage: python script.py <param1> [<param2>]")
    sys.exit(1)
# Download the list of SearX node domains
url_domain_list = 'https://data.myfpga.cn/searx.txt'
try:
    response = requests.get(url_domain_list, headers=headers)
    if response.status_code == 200:
        domain_names = response.text.split('\n')
    else:
        print("ERROR: could not fetch the domain list, please contact the administrator")
        sys.exit(1)
except Exception as e:
    print(f"ERROR: could not fetch the domain list: {str(e)}")
    sys.exit(1)
results = []  # Collected results from all pages
found_result = False  # Flag: set to True once a working domain has been found
for domain in domain_names:
    domain = domain.strip()  # Trim surrounding whitespace
    if not domain:
        continue  # Skip blank lines
    print(f'INFO: trying domain {domain}')
    if found_result:
        break  # A working domain has already been found, stop trying others
    page = 1
    url = modify_url(f'https://{domain}/search?q=test&language=auto&time_range=&safesearch=0&theme=simple', param1, param2, page)
    # Fetch the first page of results
    try:
        retVal = fetch_retVal(url, headers)
    except Exception as e:
        print(f'ERROR: domain {domain} is not usable - {str(e)}')
        continue
    if retVal is not None:
        found_result = True  # Working domain found, set the flag
        while retVal:
            results.extend(retVal)  # Append this page's results
            page += 1
            url = modify_url(f'https://{domain}/search?q=test&language=auto&time_range=&safesearch=0&theme=simple', param1, param2, page)
            print(f"INFO: fetching page {page}")
            retVal = fetch_retVal(url, headers)
if found_result:
    # Print the results as JSON
    result_json = json.dumps(results, ensure_ascii=False, indent=4)
    print(result_json)
    # Save the results to a file
    timestamp = int(time.time())
    output_filename = f'SearxCheckList_{timestamp}.txt'
    with open(output_filename, 'w') as output_file:
        for result in results:
            output_file.write(result + '\n')
    print(f"INFO: results saved to file: {output_filename}")
else:
    print("ERROR: the program could not retrieve any results, please contact the administrator")
Below is a variant that reads the node list from a local searx.txt file instead of downloading it:
import re
import socks
import requests
import urllib.parse
import json
import sys
import time
print('''
__ __ _____ ____ ____ _
| \/ |_ _| ___| _ \ / ___| / \ ___ _ __
| |\/| | | | | |_ | |_) | | _ / _ \ / __| '_ \
| | | | |_| | _| | __/| |_| |/ ___ \ | (__| | | |
|_| |_|\__, |_| |_| \____/_/ \_(_)___|_| |_|
|___/
SearX search tool {1.6.0#main}
''')
# Replace the placeholder query 'test' with '<param1>_<param2>' and append the page number
def modify_url(base_url, param1, param2, page):
    modified_url = base_url.replace('test', f'{param1}_{param2}') + f'&pageno={page}'
    return modified_url

# Read the SearX node domains from a local file, one domain per line
def extract_domain_names(filename):
    domain_names = []
    with open(filename, 'r') as file:
        for line in file:
            domain_names.append(line.strip())
    return domain_names

# Fetch one result page and extract the result links
def fetch_retVal(url, headers):
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        page_content = response.text
        regex = r'<a href="([^"]+)" class="url_wrapper"'
        extracted_urls = [urllib.parse.unquote(match.group(1).replace("&amp;", "&")) for match in re.finditer(regex, page_content, re.I | re.S)]
        if len(extracted_urls) < 2:
            return None
        else:
            return extracted_urls
    else:
        return None
# Request headers
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Dnt": "1",
"Pragma": "no-cache",
"Sec-Ch-Ua": '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
"Sec-Ch-Ua-Mobile": "?0",
"Sec-Ch-Ua-Platform": '"Windows"',
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}
# Read the command-line arguments
if len(sys.argv) != 3:
    print("Usage: python script.py <param1> <param2>")
    sys.exit(1)
param1 = sys.argv[1]
param2 = sys.argv[2]
# Read the domain list from the local file
domain_names = extract_domain_names('searx.txt')
results = []  # Collected results from all pages
found_result = False  # Flag: set to True once a working domain has been found
for domain in domain_names:
    print(f"INFO: trying domain {domain}")
    if found_result:
        break  # A working domain has already been found, stop trying others
    page = 1
    url = modify_url(f'https://{domain}/search?q=test&language=auto&time_range=&safesearch=0&theme=simple', param1, param2, page)
    # Fetch the first page of results
    try:
        retVal = fetch_retVal(url, headers)
    except Exception as e:
        print(f"ERROR: could not reach {domain} - {str(e)}, trying the next domain")
        continue
    if retVal is not None:
        found_result = True  # Working domain found, set the flag
        while retVal:
            results.extend(retVal)  # Append this page's results
            page += 1
            url = modify_url(f'https://{domain}/search?q=test&language=auto&time_range=&safesearch=0&theme=simple', param1, param2, page)
            retVal = fetch_retVal(url, headers)
if found_result:
    # Print the results as JSON
    result_json = json.dumps(results, ensure_ascii=False, indent=4)
    print(result_json)
    # Save the results to a file
    timestamp = int(time.time())
    output_filename = f'SearxCheckList_{timestamp}.txt'
    with open(output_filename, 'w') as output_file:
        for result in results:
            output_file.write(result + '\n')
    print(f"INFO: results saved to file: {output_filename}")
else:
    print("ERROR: could not retrieve any page content")
The main features of this program are: obtaining a list of SearX node domains (from the remote list or a local searx.txt), trying each domain until one responds, paging through the search results, extracting the result URLs with a regular expression, and printing the results as JSON while saving them to a timestamped text file.
Step 3: Running the program
Now that the program is written, we can run it to collect data. Execute the following command on the command line:
python script.py <param1> <param2>
Replace <param1> and <param2> with your actual parameters. The reason there are two parameters is to make the tool easier to use for SQL-related collection; those in the know will understand.
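For example, a run with two made-up parameters looks like this:
python script.py hello world
This turns into the search query q=hello_world on each SearX node the script tries; the extracted result URLs are printed as JSON and also written, one per line, to a file named SearxCheckList_<timestamp>.txt.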
Summary
With this example program, we have shown how to write a simple web crawler in Python that scrapes data from multiple websites and saves the results to a file. It is a powerful tool for many practical tasks, such as data collection, analysis, and report generation. I hope this post helps you get started with your own crawler projects!