team-2/test2_with_training/scripts/download_images.py

# scripts/download_images.py

import os
import logging
from icrawler.builtin import BingImageCrawler
from icrawler import ImageDownloader
import requests
from requests.exceptions import RequestException

# 1. Quiet the icrawler logger so that only WARNING and above are shown.
_icrawler_logger = logging.getLogger('icrawler')
_icrawler_logger.setLevel(logging.WARNING)

class ValidatingDownloader(ImageDownloader):
    """
    Downloader override that probes each URL with an HTTP HEAD request
    before delegating to the parent downloader.

    A task is skipped (``download`` returns ``False``) when the HEAD
    request fails, when the reported Content-Type is not ``image/*``, or
    when the reported Content-Length exceeds ``MAX_CONTENT_LENGTH``.
    """

    # Largest Content-Length (in bytes) accepted by the HEAD pre-check: 5 MB.
    # Subclasses may override this to change the limit.
    MAX_CONTENT_LENGTH = 5_000_000

    def download(self, task, default_ext, timeout=10, **kwargs):
        """
        Validate ``task['file_url']`` with a HEAD request, then download it.

        Args:
            task: Mapping that holds the target URL under ``'file_url'``.
            default_ext: Forwarded unchanged to the parent ``download``.
            timeout: Timeout in seconds, used for the HEAD request and
                forwarded to the parent ``download``.
            **kwargs: Forwarded unchanged to the parent ``download``.

        Returns:
            ``False`` when the URL is rejected by the pre-check; otherwise
            whatever the parent ``download`` returns.
        """
        url = task['file_url']
        try:
            # requests.head() does not follow redirects by default; without
            # allow_redirects=True a redirected image would be judged on the
            # headers of the 3xx response and wrongly rejected.
            head = requests.head(url, timeout=timeout, allow_redirects=True)
        except RequestException:
            return False

        # Accept any image/* MIME type; header values may vary in case.
        ctype = head.headers.get('Content-Type', '').strip().lower()
        if not ctype.startswith('image/'):
            return False

        try:
            clen = int(head.headers.get('Content-Length', 0))
        except (TypeError, ValueError):
            # A malformed Content-Length is treated like a missing one
            # (size unknown) instead of raising out of the downloader.
            clen = 0
        if clen > self.MAX_CONTENT_LENGTH:
            return False

        # Pre-check passed: hand over to the original downloader.
        return super().download(task, default_ext, timeout=timeout, **kwargs)

def download_images(query, max_num, output_dir):
    """
    Crawl Bing image search for ``query`` and store the results on disk.

    Args:
        query: Search keyword passed to the crawler.
        max_num: Value passed to the crawler as ``max_num``.
        output_dir: Directory used as the crawler's storage root; it is
            created if it does not exist yet.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Few threads on purpose, for stability.
    crawler_options = dict(
        feeder_threads=1,
        parser_threads=1,
        downloader_threads=2,
        storage={'root_dir': output_dir},
        downloader_cls=ValidatingDownloader,
    )
    crawl_options = dict(
        keyword=query,
        max_num=max_num,
        min_size=(200, 200),
        max_size=None,
        file_idx_offset=0,
    )

    bing_crawler = BingImageCrawler(**crawler_options)
    bing_crawler.crawl(**crawl_options)

if __name__ == "__main__":
    # Download basil and tomato images, one search query per output folder.
    for search_query, target_dir in (
        ("basil plant", "data/all_plants/basil"),
        ("tomato plant", "data/all_plants/tomato"),
    ):
        download_images(search_query, 500, target_dir)
    print("Download completato!")
Base code for testing the training of a model on basil and tomato images, to be included in step 3 of the previous code to improve the answer. 2025-08-01 23:58:15 +02:00
			`# scripts/download_images.py`

			`import os`
			`import logging`
			`from icrawler.builtin import BingImageCrawler`
			`from icrawler import ImageDownloader`
			`import requests`
			`from requests.exceptions import RequestException`

			`# 1. Configura il logger per vedere solo warning+`
			`logging.getLogger('icrawler').setLevel(logging.WARNING)`

			`class ValidatingDownloader(ImageDownloader):`
			`"""`
			`Override del downloader: prima di scrivere, verifica con HEAD che`
			`l'URL sia effettivamente un'immagine e non troppo grande.`
			`"""`
			`def download(self, task, default_ext, timeout=10, **kwargs):`
			`url = task['file_url']`
			`try:`
			`# HEAD per controllare content-type e lunghezza`
			`head = requests.head(url, timeout=timeout)`
			`ctype = head.headers.get('Content-Type', '')`
			`clen = int(head.headers.get('Content-Length', 0))`
			`# Filtro: solo jpeg/png e max 5MB`
			`if ('image' not in ctype) or (clen > 5_000_000):`
			`return False`
			`except RequestException:`
			`return False`

			`# Se OK, chiama il downloader originale`
			`return super().download(task, default_ext, timeout=timeout, **kwargs)`

			`def download_images(query, max_num, output_dir):`
			`os.makedirs(output_dir, exist_ok=True)`
			`crawler = BingImageCrawler(`
			`feeder_threads=1,`
			`parser_threads=1,`
			`downloader_threads=2, # pochi thread per stabilità`
			`storage={'root_dir': output_dir},`
			`downloader_cls=ValidatingDownloader`
			`)`
			`crawler.crawl(`
			`keyword=query,`
			`max_num=max_num,`
			`min_size=(200, 200),`
			`max_size=None,`
			`file_idx_offset=0`
			`)`

			`if __name__ == "__main__":`
			`# Scarica immagini di basilico e pomodoro`
			`download_images("basil plant", 500, "data/all_plants/basil")`
			`download_images("tomato plant", 500, "data/all_plants/tomato")`
			`print("Download completato!")`