# scripts/download_images.py
"""Download plant images from Bing, pre-checking each URL with a HEAD request."""
import logging
import os

import requests
from icrawler import ImageDownloader
from icrawler.builtin import BingImageCrawler
from requests.exceptions import RequestException

# Only show warnings and above from icrawler.
logging.getLogger('icrawler').setLevel(logging.WARNING)

logger = logging.getLogger(__name__)


class ValidatingDownloader(ImageDownloader):
    """Downloader that validates a URL with a HEAD request before fetching it.

    A URL is handed to the parent downloader only when the HEAD response
    declares an ``image/*`` Content-Type and a Content-Length not larger
    than ``max_bytes``. A missing or unparsable Content-Length is treated
    as "size unknown" and does not cause a rejection.
    """

    # Largest declared Content-Length (in bytes) that is still downloaded.
    max_bytes = 5_000_000

    def download(self, task, default_ext, timeout=10, **kwargs):
        """Validate ``task['file_url']`` and delegate to the parent downloader.

        Args:
            task: Task dict; its ``'file_url'`` entry is the URL to check.
            default_ext: Passed through unchanged to the parent ``download``.
            timeout: Timeout in seconds, used for the HEAD request and
                passed through to the parent ``download``.
            **kwargs: Passed through unchanged to the parent ``download``.

        Returns:
            ``False`` when the HEAD request fails or the URL is rejected;
            otherwise whatever the parent ``download`` returns.
        """
        url = task['file_url']
        try:
            # requests.head() does not follow redirects unless asked to;
            # without this the headers inspected below would describe the
            # 3xx response rather than the image it points to.
            head = requests.head(url, timeout=timeout, allow_redirects=True)
        except RequestException:
            return False

        # MIME types are case-insensitive and may carry parameters
        # (e.g. "image/jpeg; charset=binary"), so normalise before matching.
        content_type = head.headers.get('Content-Type', '').strip().lower()
        if not content_type.startswith('image/'):
            logger.debug("Skipping %s: Content-Type %r", url, content_type)
            return False

        try:
            content_length = int(head.headers.get('Content-Length', 0))
        except (TypeError, ValueError):
            # A malformed header must not escape as an exception into the
            # downloader worker; treat it like a missing header.
            content_length = 0
        if content_length > self.max_bytes:
            logger.debug("Skipping %s: %d bytes declared", url, content_length)
            return False

        # Checks passed: let the original downloader fetch and store the file.
        return super().download(task, default_ext, timeout=timeout, **kwargs)


def download_images(query: str, max_num: int, output_dir: str) -> None:
    """Crawl Bing for ``query`` and store up to ``max_num`` images.

    Args:
        query: Search keyword passed to the Bing crawler.
        max_num: Maximum number of images requested from the crawler.
        output_dir: Directory used as the crawler's storage root; created
            if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    crawler = BingImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=2,  # few threads, for stability
        storage={'root_dir': output_dir},
        downloader_cls=ValidatingDownloader,
    )
    crawler.crawl(
        keyword=query,
        max_num=max_num,
        min_size=(200, 200),
        max_size=None,
        file_idx_offset=0,
    )


def main() -> None:
    """Download the basil and tomato image sets."""
    download_images("basil plant", 500, "data/all_plants/basil")
    download_images("tomato plant", 500, "data/all_plants/tomato")
    print("Download completato!")


if __name__ == "__main__":
    main()