Developing a Product Image Scraper Bot
An image scraper is a specialized tool that downloads, normalizes, and saves product photos. The task is more complex than it seems: it has to deal with lazy loading, hotlink protection, watermark removal, deduplication by hash, and conversion to WebP.
Architecture
Scraper → Image URL Extractor
→ Downloader (async, proxy)
→ Image Processor (resize, convert, strip meta)
→ Hash Deduplicator
→ Storage (S3/local) + CDN
→ DB (product_images table)
Extracting Image URLs
// app/Services/ImageScraper/ImageUrlExtractor.php
use Symfony\Component\DomCrawler\Crawler;
/**
 * Extracts candidate product-image URLs from a page's HTML.
 *
 * Sources inspected: <img> tags (including common lazy-loading attributes),
 * srcset attributes on <img>/<source>, and Open Graph og:image meta tags.
 * Every URL is resolved against the page URL, de-duplicated and filtered
 * through a thumbnail/icon heuristic.
 */
class ImageUrlExtractor
{
    /** Attributes checked on <img>, lazy-loading variants first. */
    private const IMAGE_ATTRIBUTES = ['data-src', 'data-lazy-src', 'src'];

    /**
     * @param string $html    Raw HTML of the product page.
     * @param string $baseUrl Absolute URL of the page; used to resolve relative URLs.
     *
     * @return list<string> Absolute image URLs, re-indexed from 0.
     */
    public function extract(string $html, string $baseUrl): array
    {
        $crawler = new Crawler($html);
        $urls = [];

        // Standard img tags: take the first attribute that holds a usable URL,
        // so an empty data-src or an inline data: placeholder falls through to
        // the next attribute instead of discarding the node.
        $crawler->filter('img[src], img[data-src], img[data-lazy-src]')->each(
            function (Crawler $node) use (&$urls, $baseUrl): void {
                foreach (self::IMAGE_ATTRIBUTES as $attribute) {
                    $src = trim((string) $node->attr($attribute));
                    if ($src !== '' && !str_starts_with($src, 'data:')) {
                        $urls[] = $this->absoluteUrl($src, $baseUrl);

                        return;
                    }
                }
            }
        );

        // srcset (responsive images): keep only the largest candidate of each set.
        $crawler->filter('img[srcset], source[srcset]')->each(
            function (Crawler $node) use (&$urls, $baseUrl): void {
                $best = $this->bestSrcsetCandidate((string) $node->attr('srcset'));
                if ($best !== null) {
                    $urls[] = $this->absoluteUrl($best, $baseUrl);
                }
            }
        );

        // Open Graph image tags
        $crawler->filter('meta[property="og:image"]')->each(
            function (Crawler $node) use (&$urls, $baseUrl): void {
                $content = trim((string) $node->attr('content'));
                if ($content !== '') {
                    $urls[] = $this->absoluteUrl($content, $baseUrl);
                }
            }
        );

        return $this->selectHighResImages(array_unique(array_filter($urls)));
    }

    /**
     * Returns the URL of the highest-resolution candidate in a srcset value,
     * or null when the value contains no candidates. Ties keep the first one.
     */
    private function bestSrcsetCandidate(string $srcset): ?string
    {
        $bestUrl = null;
        $bestWeight = -1.0;

        foreach ($this->parseSrcset($srcset) as [$url, $weight]) {
            if ($weight > $bestWeight) {
                $bestUrl = $url;
                $bestWeight = $weight;
            }
        }

        return $bestUrl;
    }

    /**
     * Splits a srcset value into candidates.
     *
     * Candidates are split only on commas that are adjacent to whitespace or
     * that directly follow a width/density descriptor, so commas inside a URL
     * (e.g. ".../c_fill,w_800/a.jpg") do not break it apart.
     *
     * @return list<array{0: string, 1: float}> Pairs of [url, descriptor value];
     *                                          a missing descriptor counts as 1.0 ("1x").
     */
    private function parseSrcset(string $srcset): array
    {
        $parts = preg_split('/\s+,\s*|,\s+|(?<=\d[wx]),/', trim($srcset)) ?: [];
        $candidates = [];

        foreach ($parts as $part) {
            $components = preg_split('/\s+/', trim($part)) ?: [];
            $url = $components[0] ?? '';
            if ($url === '') {
                continue;
            }
            // "480w" -> 480.0, "2x" -> 2.0; the unit suffix is ignored by the cast.
            $weight = isset($components[1]) ? (float) $components[1] : 1.0;
            $candidates[] = [$url, $weight];
        }

        return $candidates;
    }

    /**
     * Resolves $url against $baseUrl.
     *
     * Handles absolute URLs (returned unchanged), protocol-relative ("//host/x"),
     * root-relative ("/x") and document-relative ("x", "../x") references.
     * When $baseUrl has no host the input is returned unchanged.
     */
    private function absoluteUrl(string $url, string $baseUrl): string
    {
        $url = trim($url);

        // Already has a scheme (http:, https:, data:, ...) - nothing to resolve.
        if ($url === '' || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url) === 1) {
            return $url;
        }

        $base = parse_url($baseUrl);
        if (!is_array($base) || !isset($base['host'])) {
            return $url;
        }

        $scheme = $base['scheme'] ?? 'https';
        if (str_starts_with($url, '//')) {
            return $scheme . ':' . $url;
        }

        $origin = $scheme . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
        if (str_starts_with($url, '/')) {
            return $origin . $url;
        }

        // Document-relative: resolve against the directory of the base path.
        $basePath = $base['path'] ?? '/';
        $lastSlash = strrpos($basePath, '/');
        $directory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);

        return $origin . $this->removeDotSegments($directory . $url);
    }

    /**
     * Collapses "." and ".." segments in an absolute path, leaving any
     * query string or fragment untouched.
     */
    private function removeDotSegments(string $path): string
    {
        $cut = strcspn($path, '?#');
        $suffix = substr($path, $cut);
        $segments = [];

        foreach (explode('/', substr($path, 0, $cut)) as $segment) {
            if ($segment === '..') {
                // Never pop the leading empty segment that represents the root.
                if (count($segments) > 1) {
                    array_pop($segments);
                }
            } elseif ($segment !== '.') {
                $segments[] = $segment;
            }
        }

        return implode('/', $segments) . $suffix;
    }

    /**
     * Drops URLs that look like thumbnails/icons and keeps only URLs ending in
     * a known raster image extension (optionally followed by a query string).
     *
     * @param array<array-key, string> $urls
     *
     * @return list<string> Re-indexed so callers can rely on 0-based positions.
     */
    private function selectHighResImages(array $urls): array
    {
        return array_values(array_filter($urls, static function (string $url): bool {
            return preg_match('/thumb|small|icon|logo|favicon|_s\.|_t\./i', $url) !== 1
                && preg_match('/\.(jpg|jpeg|png|webp|gif)(\?.*)?$/i', $url) === 1;
        }));
    }
}
Async Download
// app/Services/ImageScraper/ImageDownloader.php
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
/**
 * Downloads batches of images concurrently through Guzzle.
 *
 * Responses are kept only when they are served with an image Content-Type
 * and exceed a minimum byte size (see isValidImage()).
 */
class ImageDownloader
{
    /** Bodies at or below this size are treated as placeholders (e.g. 1x1 px). */
    private const MIN_IMAGE_BYTES = 5000;

    /** Number of requests kept in flight at the same time. */
    private const CONCURRENCY = 5;

    private Client $client;

    /** @var list<string> Proxy URIs used round-robin; empty means direct connections. */
    private array $proxyPool;

    /**
     * @param array<array-key, string> $proxyPool Optional proxy URIs (passed to Guzzle's
     *                                            "proxy" request option); empty entries are ignored.
     */
    public function __construct(array $proxyPool = [])
    {
        $this->proxyPool = array_values(array_filter($proxyPool));
        $this->client = new Client([
            'timeout' => 30,
            'headers' => [
                'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
                'Accept' => 'image/webp,image/apng,image/*,*/*;q=0.8',
                // Referer is set per request in downloadBatch().
            ],
            'allow_redirects' => ['max' => 5],
        ]);
    }

    /**
     * Downloads all given URLs and returns the bodies of those that look like real images.
     *
     * @param array<array-key, string> $urls    Image URLs; keys are ignored and duplicates removed.
     * @param string                   $referer Sent as the Referer header (hotlink protection).
     *
     * @return array<string, string> Map of source URL => raw response body.
     */
    public function downloadBatch(array $urls, string $referer): array
    {
        // Re-index: callers may pass sparse keys (e.g. after array_filter), while the
        // pool callbacks below look URLs up by the index yielded from the generator.
        $urls = array_values(array_unique($urls));
        if ($urls === []) {
            return [];
        }

        $headers = $referer !== '' ? ['Referer' => $referer] : [];

        $requests = function () use ($urls, $headers) {
            foreach ($urls as $index => $url) {
                try {
                    $request = new Request('GET', $url, $headers);
                } catch (\InvalidArgumentException $e) {
                    // A malformed URL must not abort the rest of the batch.
                    \Log::warning("Failed to download: {$url}: {$e->getMessage()}");
                    continue;
                }

                $proxy = $this->proxyFor($index);
                if ($proxy === null) {
                    yield $index => $request;
                    continue;
                }

                // The pool accepts callables returning a promise, which allows
                // per-request options such as the proxy.
                yield $index => fn (array $options = []) => $this->client->sendAsync(
                    $request,
                    ['proxy' => $proxy] + $options
                );
            }
        };

        $results = [];
        $pool = new Pool($this->client, $requests(), [
            'concurrency' => self::CONCURRENCY,
            'fulfilled' => function ($response, $index) use ($urls, &$results): void {
                $content = (string) $response->getBody();
                $contentType = $response->getHeader('Content-Type')[0] ?? '';
                if ($this->isValidImage($content, $contentType)) {
                    $results[$urls[$index]] = $content;
                }
            },
            'rejected' => function ($reason, $index) use ($urls): void {
                // Log only the message; interpolating the exception itself
                // would dump its full stack trace into the log line.
                $message = $reason instanceof \Throwable ? $reason->getMessage() : (string) $reason;
                \Log::warning("Failed to download: {$urls[$index]}: {$message}");
            },
        ]);
        $pool->promise()->wait();

        return $results;
    }

    /**
     * Picks a proxy for the request at $index (round-robin), or null when no proxies are configured.
     */
    private function proxyFor(int $index): ?string
    {
        if ($this->proxyPool === []) {
            return null;
        }

        return $this->proxyPool[$index % count($this->proxyPool)];
    }

    /**
     * Accepts a response only if it is declared as an image and is larger than
     * MIN_IMAGE_BYTES, which filters out tiny placeholder images.
     */
    private function isValidImage(string $content, string $contentType): bool
    {
        // Media types are case-insensitive.
        if (!str_starts_with(strtolower(trim($contentType)), 'image/')) {
            return false;
        }

        return strlen($content) > self::MIN_IMAGE_BYTES;
    }
}
Image Processing
// app/Services/ImageScraper/ImageProcessor.php
use Intervention\Image\ImageManager;
use Intervention\Image\Drivers\Gd\Driver;
/**
 * Normalizes a downloaded image: computes a perceptual hash, scales the
 * image down to a bounding box and encodes it as both WebP and JPEG.
 */
class ImageProcessor
{
    private ImageManager $manager;

    public function __construct()
    {
        $this->manager = new ImageManager(new Driver());
    }

    /**
     * @param string $rawContent Raw bytes of the source image.
     * @param array  $options    Optional 'max_width' / 'max_height' (default 1200 each).
     *
     * @return ProcessedImage Hash, encoded variants and final dimensions.
     */
    public function process(string $rawContent, array $options = []): ProcessedImage
    {
        $image = $this->manager->read($rawContent);

        // Compute the perceptual hash BEFORE resizing/encoding (used for deduplication).
        $hash = $this->perceptualHash($image);

        // Normalize size
        $maxWidth = $options['max_width'] ?? 1200;
        $maxHeight = $options['max_height'] ?? 1200;
        if ($image->width() > $maxWidth || $image->height() > $maxHeight) {
            $image->scaleDown($maxWidth, $maxHeight);
        }

        // Remove EXIF data (contains GPS and personal data)
        // Intervention Image removes them automatically when encoding
        $webpContent = (string) $image->toWebp(quality: 85);
        $jpegContent = (string) $image->toJpeg(quality: 85);

        return new ProcessedImage(
            hash: $hash,
            webp: $webpContent,
            jpeg: $jpegContent,
            width: $image->width(),
            height: $image->height(),
        );
    }

    /**
     * Difference hash (dHash): the image is reduced to 9x8 greyscale pixels and
     * each of the 8 rows contributes 8 bits, one per horizontally adjacent pair
     * (1 when the left pixel is brighter than the right one).
     *
     * @return string 16 lowercase hex characters (64 bits, zero-padded).
     */
    private function perceptualHash(mixed $image): string
    {
        // Work on a copy so the caller's image is not resized.
        $small = clone $image;
        $small->resize(9, 8)->greyscale();

        $bits = '';
        for ($y = 0; $y < 8; $y++) {
            for ($x = 0; $x < 8; $x++) {
                // After greyscale() all RGB channels are equal, so red is representative.
                $left = $small->pickColor($x, $y)->red()->toInt();
                $right = $small->pickColor($x + 1, $y)->red()->toInt();
                $bits .= $left > $right ? '1' : '0';
            }
        }

        return $this->bitsToHex($bits);
    }

    /**
     * Converts a string of '0'/'1' characters to hex, four bits at a time.
     *
     * base_convert() is deliberately avoided: it may lose precision on large
     * numbers (a 64-bit value does not fit its internal representation) and it
     * drops leading zeros, so hashes would be inexact and of varying length.
     */
    private function bitsToHex(string $bits): string
    {
        if ($bits === '') {
            return '';
        }

        // Left-pad to a whole number of nibbles.
        $padded = str_pad($bits, (int) (ceil(strlen($bits) / 4) * 4), '0', STR_PAD_LEFT);
        $hex = '';
        foreach (str_split($padded, 4) as $nibble) {
            $hex .= dechex((int) bindec($nibble));
        }

        return $hex;
    }
}
Saving to S3 and Database
// app/Jobs/DownloadAndStoreProductImages.php
/**
 * Queued job: extracts image URLs from a product page, downloads and
 * processes them, then stores every new image for the product.
 */
class DownloadAndStoreProductImages implements ShouldQueue
{
    public int $tries = 3;

    /**
     * @param int|string $productId  Identifier written to product_images.product_id.
     * @param string     $productUrl URL of the product page; used as base URL and Referer.
     * @param string     $html       HTML of the product page.
     */
    public function __construct(
        private int|string $productId,
        private string $productUrl,
        private string $html,
    ) {
    }

    public function handle(
        ImageUrlExtractor $extractor,
        ImageDownloader $downloader,
        ImageProcessor $processor,
        ImageStorage $storage
    ): void {
        $urls = $extractor->extract($this->html, $this->productUrl);
        if ($urls === []) {
            return;
        }

        $rawImages = $downloader->downloadBatch($urls, $this->productUrl);

        // Continue after the product's last stored position so a retry or a
        // re-scrape does not hand out positions that are already taken.
        $lastPosition = ProductImage::where('product_id', $this->productId)->max('position');
        $position = $lastPosition === null ? 0 : (int) $lastPosition + 1;

        foreach ($rawImages as $url => $content) {
            try {
                $processed = $processor->process($content);
            } catch (\Throwable $e) {
                // One undecodable image should not fail the whole product.
                \Log::warning("Failed to process image {$url}: {$e->getMessage()}");
                continue;
            }

            // Skip duplicates by perceptual hash, scoped to this product so that
            // different products may legitimately share the same picture.
            $isDuplicate = ProductImage::where('product_id', $this->productId)
                ->where('phash', $processed->hash)
                ->exists();
            if ($isDuplicate) {
                continue;
            }

            $path = $storage->store($this->productId, $processed);

            ProductImage::create([
                'product_id' => $this->productId,
                'source_url' => $url,
                'path' => $path,
                'phash' => $processed->hash,
                'width' => $processed->width,
                'height' => $processed->height,
                'position' => $position++,
            ]);
        }
    }
}
Hotlink Protection on Source
Some sites return a placeholder instead of the real image when the Referer header is missing. Solution:
// Always pass the product page URL as the Referer when downloading
$images = $downloader->downloadBatch($urls, $productPageUrl);
// downloadBatch() keeps a response only if its Content-Type is image/* and it is
// larger than 5000 bytes; anything smaller is likely a "403 Forbidden Image" placeholder
Development Timeline
Product image scraper for one source with S3 storage: 3-5 business days, including deduplication and WebP conversion.







