The logic is like this... the rest is up to you.
import os
import zipfile

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse


def extract_image_urls(html_content):
    """Collect the src attribute of every <img> tag in the HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    img_urls = []
    for img_tag in soup.find_all('img'):
        src = img_tag.get('src')
        if src:
            img_urls.append(src)
    return img_urls


def download_images(img_urls, output_folder):
    """Download each image into output_folder, named after the last path segment."""
    os.makedirs(output_folder, exist_ok=True)
    for img_url in img_urls:
        # A saved page often has relative src values; without the page's
        # original address there is nothing to resolve them against, so skip.
        if urlparse(img_url).scheme not in ('http', 'https'):
            continue
        file_name = os.path.basename(urlparse(img_url).path)
        if not file_name:  # URL path ends in '/', no usable file name
            continue
        try:
            response = requests.get(img_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'Skipped {img_url}: {exc}')
            continue
        with open(os.path.join(output_folder, file_name), 'wb') as img_file:
            img_file.write(response.content)


def process_html_file(file_path, output_folder):
    """Read one HTML file, extract its image URLs and download them."""
    with open(file_path, 'r', encoding='utf-8') as html_file:
        html_content = html_file.read()
    img_urls = extract_image_urls(html_content)
    download_images(img_urls, output_folder)


def process_archive(zip_path, output_folder):
    """Unpack the ZIP archive and process every HTML file found inside."""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(output_folder)
    for root, _, files in os.walk(output_folder):
        for file in files:
            if file.endswith(('.html', '.htm')):
                html_file_path = os.path.join(root, file)
                process_html_file(html_file_path, output_folder)


# Usage example
archive_path = 'path_to_your_archive.zip'
output_folder = 'folder_for_saved_images'
process_archive(archive_path, output_folder)
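Part of "the rest": if you know the URL the page was originally saved from, relative src values can be resolved instead of skipped, which is presumably what the (otherwise unused) urljoin import was for. A minimal sketch, assuming a base_url that you supply yourself (not something the script can discover on its own):

from urllib.parse import urljoin

def resolve_image_urls(img_urls, base_url):
    # base_url is an assumption: the original address of the saved page,
    # e.g. 'https://example.com/article/'. urljoin leaves absolute URLs
    # untouched and resolves relative ones ('images/pic.png') against it.
    return [urljoin(base_url, src) for src in img_urls]

# Hypothetical usage, before calling download_images:
# img_urls = resolve_image_urls(extract_image_urls(html_content),
#                               'https://example.com/article/')

With that in place you could drop the scheme check in download_images, since every URL handed to it would already be absolute.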