56 lines
1.8 KiB
Python
56 lines
1.8 KiB
Python
|
# scripts/download_images.py
|
||
|
|
||
|
import os
|
||
|
import logging
|
||
|
from icrawler.builtin import BingImageCrawler
|
||
|
from icrawler import ImageDownloader
|
||
|
import requests
|
||
|
from requests.exceptions import RequestException
|
||
|
|
||
|
# Silence icrawler's per-file chatter: only warnings and above reach the console.
icrawler_logger = logging.getLogger('icrawler')
icrawler_logger.setLevel(logging.WARNING)
|
||
|
|
||
|
class ValidatingDownloader(ImageDownloader):
    """Downloader that vets each URL with a HEAD request before fetching.

    The HEAD response must advertise an image content type and a payload
    no larger than ``MAX_BYTES``; anything else is skipped so the dataset
    is not polluted by broken links or oversized files.
    """

    # Upper bound on the advertised payload size, in bytes (5 MB).
    MAX_BYTES = 5_000_000

    def download(self, task, default_ext, timeout=10, **kwargs):
        """Validate ``task['file_url']`` via HEAD, then delegate to the parent.

        Returns False (icrawler's "skip" signal) when the HEAD request
        fails, the content type is not an image, or the declared size
        exceeds ``MAX_BYTES``; otherwise defers to the stock downloader.
        """
        url = task['file_url']
        try:
            # Follow redirects so CDN-hosted images report their real
            # headers — requests.head() does NOT follow them by default,
            # which silently rejected every redirected image URL.
            head = requests.head(url, timeout=timeout, allow_redirects=True)
            ctype = head.headers.get('Content-Type', '')
            # A malformed Content-Length (e.g. an empty string) raises
            # ValueError; the original let it propagate and kill the
            # downloader thread — treat it as a validation failure instead.
            clen = int(head.headers.get('Content-Length', 0) or 0)
        except (RequestException, ValueError):
            return False
        # NOTE(review): the original comment claimed "jpeg/png only", but
        # the check accepts any image/* type — kept as-is to preserve
        # behavior; tighten to specific subtypes if that was the intent.
        if 'image' not in ctype or clen > self.MAX_BYTES:
            return False
        # Validation passed: let the original downloader do the real fetch.
        return super().download(task, default_ext, timeout=timeout, **kwargs)
|
||
|
|
||
|
def download_images(query, max_num, output_dir):
    """Crawl Bing for *query* and store up to *max_num* images in *output_dir*.

    The output directory is created if missing. Every candidate file is
    vetted by ValidatingDownloader before being written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Few worker threads on purpose: keeps the crawl stable.
    crawler = BingImageCrawler(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=2,
        storage={'root_dir': output_dir},
        downloader_cls=ValidatingDownloader,
    )

    # At least 200x200 pixels, no upper bound on dimensions, numbering
    # starts from index 0.
    crawl_options = dict(
        keyword=query,
        max_num=max_num,
        min_size=(200, 200),
        max_size=None,
        file_idx_offset=0,
    )
    crawler.crawl(**crawl_options)
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
# Scarica immagini di basilico e pomodoro
|
||
|
download_images("basil plant", 500, "data/all_plants/basil")
|
||
|
download_images("tomato plant", 500, "data/all_plants/tomato")
|
||
|
print("Download completato!")
|