team-2/test2_with_training/scripts/download_images.py

56 lines
1.8 KiB
Python
Raw Normal View History

# scripts/download_images.py
import os
import logging
from icrawler.builtin import BingImageCrawler
from icrawler import ImageDownloader
import requests
from requests.exceptions import RequestException
# 1. Configura il logger per vedere solo warning+
logging.getLogger('icrawler').setLevel(logging.WARNING)
class ValidatingDownloader(ImageDownloader):
"""
Override del downloader: prima di scrivere, verifica con HEAD che
l'URL sia effettivamente un'immagine e non troppo grande.
"""
def download(self, task, default_ext, timeout=10, **kwargs):
url = task['file_url']
try:
# HEAD per controllare content-type e lunghezza
head = requests.head(url, timeout=timeout)
ctype = head.headers.get('Content-Type', '')
clen = int(head.headers.get('Content-Length', 0))
# Filtro: solo jpeg/png e max 5MB
if ('image' not in ctype) or (clen > 5_000_000):
return False
except RequestException:
return False
# Se OK, chiama il downloader originale
return super().download(task, default_ext, timeout=timeout, **kwargs)
def download_images(query, max_num, output_dir):
os.makedirs(output_dir, exist_ok=True)
crawler = BingImageCrawler(
feeder_threads=1,
parser_threads=1,
downloader_threads=2, # pochi thread per stabilità
storage={'root_dir': output_dir},
downloader_cls=ValidatingDownloader
)
crawler.crawl(
keyword=query,
max_num=max_num,
min_size=(200, 200),
max_size=None,
file_idx_offset=0
)
if __name__ == "__main__":
# Scarica immagini di basilico e pomodoro
download_images("basil plant", 500, "data/all_plants/basil")
download_images("tomato plant", 500, "data/all_plants/tomato")
print("Download completato!")