(Original) Combining a Web Image Crawler with PaddlePaddle to Crawl and Classify Images
First, the test code in Python; for server-side deployment, see the end of this post.

import os
import time
import argparse
import requests
import re
import io
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
import cv2
from shutil import copyfile
import numpy as np
import paddlex as pdx
import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False below
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Load the exported PaddleX classification model
model = pdx.load_model('./inference_model')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
}
# Only absolute image URLs with these extensions are kept
img_regex = re.compile(r'(http|https)?://[^\s]*\.(jpg|jpeg|png|gif|bmp)')


# Get all image links and page links from a page
def get_links(url, timeout=10):
    try:
        response = requests.get(url, headers=headers, timeout=timeout, verify=False)
        response.raise_for_status()  # Raise if the status code is not 200
    except requests.exceptions.RequestException as e:
        print(f"Error requesting {url}: {e}")
        return ([], [])
    soup = BeautifulSoup(response.text, 'html.parser')
    img_links = [img.get('src') for img in soup.find_all('img')]
    page_links = [a.get('href') for a in soup.find_all('a', href=True)]
    return (img_links, page_links)


# Download a single image, then classify it with the loaded model
def download_img(img_url, save_path, timeout=10):
    try:
        img_name = os.path.basename(img_url)
        img_data = requests.get(img_url, headers=headers, timeout=timeout, verify=False).content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {img_url}: {e}")
        return
    # Verify that the image data is complete
    if not is_valid_image(img_data):
        print(f"Error downloading {img_url}: image is incomplete or corrupted")
        return
    # Filter out images smaller than 224x224
    img = Image.open(io.BytesIO(img_data))
    width, height = img.size
    if width < 224 or height < 224:
        return
    # Save the image
    with open(os.path.join(save_path, img_name), 'wb') as f:
        f.write(img_data)
    # Classify the image and move it into a folder named after the predicted category
    result_path = "./PredictImg"
    try:
        im = cv2.imdecode(np.fromfile(os.path.join(save_path, img_name), dtype=np.uint8), -1)
        im = im.astype('float32')
        result = model.predict(im)
        category_dir = os.path.join(result_path, result[0]['category'])
        if not os.path.exists(category_dir):  # Create the category folder if needed
            os.makedirs(category_dir)
        if result[0]['score'] > 0.9:
            copyfile(os.path.join(save_path, img_name), os.path.join(category_dir, img_name))
            print('OK:' + os.path.join(category_dir, img_name))
            os.remove(os.path.join(save_path, img_name))
    except Exception:
        print('ERROR:' + os.path.join(save_path, img_name))


# Check whether the image data can be decoded
def is_valid_image(img_data):
    try:
        Image.open(io.BytesIO(img_data)).verify()
        return True
    except Exception:
        return False


# Crawl pages up to max_depth and download the images found on each page
def download_all_images(url, save_path, max_depth=3, delay=0.5, timeout=10):
    visited_links = set()        # Links that have already been visited
    download_queue = [(url, 0)]  # Queue of (link, depth) pairs still to visit
    page_count = 0               # Number of pages visited successfully
    img_count = 0                # Number of images downloaded successfully
    while download_queue:
        url, depth = download_queue.pop(0)
        if depth > max_depth:
            continue
        if url in visited_links:
            continue
        try:
            response = requests.get(url, headers=headers, timeout=timeout, verify=False)
            response.raise_for_status()  # Raise if the status code is not 200
            soup = BeautifulSoup(response.text, 'html.parser')
            img_links = [
                img.get('src') for img in soup.find_all('img')
                if img.get('src') and img_regex.match(img.get('src'))
            ]
        except Exception as e:
            print(f"Error visiting {url}: {e}")
            continue
        # Download all images on the current page with a thread pool
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = []
            for img_url in img_links:
                # Resolve relative URLs against the current page
                if not img_url.startswith('http'):
                    img_url = urljoin(url, img_url)
                futures.append(executor.submit(download_img, img_url, save_path, timeout=timeout))
            for future in as_completed(futures):
                if future.exception() is not None:
                    print(f"Error while downloading an image: {future.exception()}")
                else:
                    img_count += 1
        # Add all links on the current page to the queue
        for page_link in set(get_links(url)[1]):
            if not page_link.startswith('http'):
                page_link = urljoin(url, page_link)
            if page_link not in visited_links:
                download_queue.append((page_link, depth + 1))
        visited_links.add(url)
        page_count += 1
        print(f"Visited {page_count} pages, downloaded {img_count} images so far")
        # Pause briefly so the crawl does not get the IP banned for hammering the site
        time.sleep(delay)


# Read the website URLs to crawl from a text file, one per line
def read_urls_from_file(file_path):
    urls = []
    try:
        with open(file_path, 'r') as file:
            urls = [line.strip() for line in file]
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
    return urls


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Download Images from URLs')
    parser.add_argument('--url_file', type=str, help='Path to the file containing the URLs', required=True)
    parser.add_argument('--save_path', type=str, help='Path to save the downloaded images', default='./Images')
    parser.add_argument('--max_depth', type=int, help='Maximum depth for crawling', default=3)
    args = parser.parse_args()
    urls = read_urls_from_file(args.url_file)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    for url in urls:
        download_all_images(url, args.save_path, max_depth=args.max_depth)
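Before running it, note the environment the script assumes: an exported PaddleX image-classification model in ./inference_model, plus the third-party packages imported at the top. The pdx.load_model call and the result[0]['category'] / result[0]['score'] result format match the PaddleX 1.x classification API, so that generation of PaddleX is assumed here. The dependencies can be installed roughly like this:

pip install requests beautifulsoup4 pillow opencv-python numpy paddlepaddle paddlex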
This code is a program that downloads images from a given list of website URLs. It uses Python's requests library to send HTTP requests for page content and BeautifulSoup to parse the HTML and extract image links and page links.
The main logic of the code is as follows:
Import the required libraries and modules.
Define global variables and constants, including the request headers and the regular expression used to match image URLs.
Define several functions:
get_links(url, timeout): fetches a page and returns all image links and page links found in it.
download_img(img_url, save_path, timeout): downloads an image and classifies it with the loaded model.
is_valid_image(img_data): checks whether the downloaded image data is complete and decodable.
download_all_images(url, save_path, max_depth, delay, timeout): crawls pages up to the given depth (breadth-first, using a queue) and downloads the images found on each page.
read_urls_from_file(file_path): reads the list of website URLs to crawl from a text file.
In the if __name__ == '__main__': block, use argparse to parse the command-line arguments, including the URL file, the save path, and the maximum crawl depth.
Read the website URLs to crawl from the specified text file.
Iterate over the URLs and call download_all_images for each one, saving the images to the specified path (see the example run below).
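As a concrete example, a run might look like the following (the script name crawl_and_classify.py and the URLs are placeholders):

urls.txt:
https://example.com
https://example.org

python crawl_and_classify.py --url_file urls.txt --save_path ./Images --max_depth 2

Downloaded images are written to ./Images, and any image classified with a score above 0.9 is moved into ./PredictImg/<category>/.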
The script downloads the images on each page with a thread pool, extracts image and page links by parsing the HTML, and uses PaddleX to load a trained model and classify each downloaded image. It also includes some error handling, such as catching request exceptions and verifying that downloaded images are intact. Finally, it throttles the crawl with a configurable delay to reduce the risk of the IP being blocked. The classification step can also be run on its own, as sketched below.
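Here is a minimal sketch of just that classification step, assuming the same exported model in ./inference_model and a local test image sample.jpg (a placeholder filename):

import cv2
import numpy as np
import paddlex as pdx

# Load the exported classification model, exactly as in the crawler above
model = pdx.load_model('./inference_model')

# Read the image the same way download_img does (np.fromfile + imdecode handles non-ASCII paths)
im = cv2.imdecode(np.fromfile('sample.jpg', dtype=np.uint8), -1).astype('float32')

# predict() returns a list of dicts such as [{'category': ..., 'score': ...}]
result = model.predict(im)
print(result[0]['category'], result[0]['score'])

This is the same call chain download_img uses, just without the crawling and file bookkeeping.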
Of course, you can also deploy this on the server side. The PHP code below is still at the testing stage and is provided for reference only.
<?php

use GuzzleHttp\Client;
use GuzzleHttp\Exception\GuzzleException;
use Symfony\Component\DomCrawler\Crawler;

require 'vendor/autoload.php';

$client = new Client();
$headers = [
    'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'
];
$img_regex = '/(http|https)?:\/\/[^\s]*\.(jpg|jpeg|png|gif|bmp)/i';

// Get all image links and page links from a page
function getLinks($url, $timeout = 10)
{
    global $client, $headers;
    try {
        $response = $client->get($url, ['headers' => $headers, 'timeout' => $timeout]);
        $html = $response->getBody()->getContents();
    } catch (GuzzleException $e) {
        echo "Error requesting $url: " . $e->getMessage() . "\n";
        return [[], []];
    }
    $crawler = new Crawler($html);
    $imgLinks = $crawler->filter('img')->each(function ($node) {
        return $node->attr('src');
    });
    $pageLinks = $crawler->filter('a[href]')->each(function ($node) {
        return $node->attr('href');
    });
    return [$imgLinks, $pageLinks];
}

// Download a single image and classify it
function downloadImg($imgUrl, $savePath, $timeout = 10)
{
    global $client, $headers;
    try {
        $imgName = basename($imgUrl);
        $response = $client->get($imgUrl, ['headers' => $headers, 'timeout' => $timeout]);
        $imgData = $response->getBody()->getContents();
    } catch (GuzzleException $e) {
        echo "Error downloading $imgUrl: " . $e->getMessage() . "\n";
        return;
    }
    // Check if the image is valid
    if (!isValidImage($imgData)) {
        echo "Error downloading $imgUrl: image is incomplete or corrupted\n";
        return;
    }
    // Save the image
    file_put_contents($savePath . '/' . $imgName, $imgData);
    // Predict the image
    $resultPath = './PredictImg';
    try {
        $im = imagecreatefromstring($imgData);
        imagealphablending($im, false);
        imagesavealpha($im, true);
        imagepng($im, $savePath . '/' . $imgName);
        imagedestroy($im);
        $im = imagecreatefrompng($savePath . '/' . $imgName);
        // modelPredict() is assumed to be provided elsewhere (e.g. a call to a model-serving API);
        // it should return the same structure as the Python version: [['category' => ..., 'score' => ...]]
        $result = modelPredict($im);
        if (!is_dir($resultPath . '/' . $result[0]['category'])) {
            mkdir($resultPath . '/' . $result[0]['category'], 0755, true);
        }
        if ($result[0]['score'] > 0.9) {
            copy($savePath . '/' . $imgName, $resultPath . '/' . $result[0]['category'] . '/' . $imgName);
            echo 'OK: ' . $resultPath . '/' . $result[0]['category'] . '/' . $imgName . "\n";
            unlink($savePath . '/' . $imgName);
        }
    } catch (\Throwable $th) {
        echo 'ERROR: ' . $savePath . '/' . $imgName . "\n";
    }
}

// Check whether the image data can be decoded
function isValidImage($imgData)
{
    try {
        $im = @imagecreatefromstring($imgData);
        if ($im !== false) {
            imagedestroy($im);
            return true;
        }
        return false;
    } catch (\Throwable $th) {
        return false;
    }
}

// Crawl pages up to $maxDepth and download the images found on each page
function downloadAllImages($url, $savePath, $maxDepth = 3, $delay = 0.5, $timeout = 10)
{
    global $client, $headers, $img_regex;
    $visitedLinks = [];
    $downloadQueue = [[$url, 0]];
    $pageCount = 0;
    $imgCount = 0;
    while (!empty($downloadQueue)) {
        [$url, $depth] = array_shift($downloadQueue);
        if ($depth > $maxDepth) {
            continue;
        }
        if (in_array($url, $visitedLinks)) {
            continue;
        }
        try {
            $response = $client->get($url, ['headers' => $headers, 'timeout' => $timeout]);
            $html = $response->getBody()->getContents();
        } catch (GuzzleException $e) {
            echo "Error visiting $url: " . $e->getMessage() . "\n";
            continue;
        }
        $crawler = new Crawler($html);
        // Keep only absolute image URLs that match the image regex
        $imgLinks = array_filter($crawler->filter('img')->each(function ($node) use ($img_regex) {
            $imgSrc = $node->attr('src');
            return ($imgSrc && preg_match($img_regex, $imgSrc)) ? $imgSrc : null;
        }));
        // Download images from the current page (sequentially; a Guzzle Pool could be used for concurrency)
        foreach ($imgLinks as $imgUrl) {
            downloadImg($imgUrl, $savePath, $timeout);
            $imgCount++;
        }
        // Get all absolute page links from the current page
        $pageLinks = array_filter($crawler->filter('a[href]')->each(function ($node) {
            $pageLink = $node->attr('href');
            return ($pageLink && preg_match('/^https?:\/\//i', $pageLink)) ? $pageLink : null;
        }));
        // Add the page links to the download queue
        foreach ($pageLinks as $pageLink) {
            if (!in_array($pageLink, $visitedLinks)) {
                $downloadQueue[] = [$pageLink, $depth + 1];
            }
        }
        $visitedLinks[] = $url;
        $pageCount++;
        echo "Visited $pageCount pages, downloaded $imgCount images so far\n";
        // Pause for a while to avoid being blocked
        usleep((int) ($delay * 1000000));
    }
}

// Read URLs from a file, one per line
function readUrlsFromFile($filePath)
{
    $urls = [];
    try {
        $file = fopen($filePath, 'r');
        if ($file) {
            while (($line = fgets($file)) !== false) {
                $urls[] = trim($line);
            }
            fclose($file);
        }
    } catch (\Throwable $th) {
        echo "Error reading file $filePath: " . $th->getMessage() . "\n";
    }
    return $urls;
}

// Main code
$urlFile = isset($argv[1]) ? $argv[1] : null;
$savePath = './Images';
$maxDepth = 3;
if (empty($urlFile)) {
    echo "Please provide the path to a file containing the URLs\n";
    exit(1);
}
$urls = readUrlsFromFile($urlFile);
if (!is_dir($savePath)) {
    mkdir($savePath, 0755, true);
}
foreach ($urls as $url) {
    downloadAllImages($url, $savePath, $maxDepth);
}
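Assuming the script is saved as crawler.php (a placeholder name), the dependencies would typically be pulled in with Composer, e.g. composer require guzzlehttp/guzzle symfony/dom-crawler symfony/css-selector (the css-selector package is what lets Crawler::filter() accept CSS selectors), and the crawl started with php crawler.php urls.txt, where urls.txt lists one URL per line. Keep in mind that modelPredict() is not implemented in this listing; on the server side it would have to be backed by something of your own, such as an HTTP call to a service that serves the PaddleX model.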