Developing a Supplier Product Scraper Bot
A product scraper automates data retrieval from supplier websites without manual copying. The result — structured data in your database, updated on schedule.
Scraper architecture
Scheduler (cron / Horizon)
→ Scraper Job
→ HTTP Client (Guzzle / curl)
→ HTML Parser (Symfony DomCrawler / Goutte)
→ Data Normalizer
→ Duplicate Checker
→ Product Repository
→ Notification (on errors)
Tools stack
| Task | Tool |
|---|---|
| HTTP requests | Guzzle 7 |
| HTML parsing | Symfony DomCrawler + CSS Selector |
| JS sites | Puppeteer (Node) / Playwright |
| Queues | Laravel Queue + Redis |
| Proxy | Rotating proxy pool |
| Storage | PostgreSQL / MySQL |
Basic PHP scraper
// app/Services/Scrapers/SupplierScraper.php
use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;
class SupplierScraper
{
    private Client $client;

    /** Rotated per request to reduce trivial bot fingerprinting. */
    private const USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:125.0) Gecko/20100101 Firefox/125.0',
    ];

    /**
     * @param string $baseUrl   Supplier site root, e.g. "https://supplier.example".
     * @param array  $proxyPool Optional list of proxy URIs; a fresh one is drawn per attempt.
     */
    public function __construct(
        private string $baseUrl,
        private array $proxyPool = []
    ) {
        $this->client = new Client([
            'timeout' => 15,
            'connect_timeout' => 5,
            'headers' => [
                'User-Agent' => $this->randomUserAgent(),
                'Accept-Language' => 'uk-UA,uk;q=0.9',
                'Accept' => 'text/html,application/xhtml+xml',
            ],
        ]);
    }

    /**
     * Scrape a category listing page into an array of product previews.
     *
     * @return array<int, array{url: ?string, title: string, price: float, sku: ?string}>
     */
    public function scrapeProductList(string $categoryUrl): array
    {
        $html = $this->fetchWithRetry($categoryUrl);
        $crawler = new Crawler($html);
        return $crawler->filter('.product-card')->each(function (Crawler $node) {
            return [
                'url' => $node->filter('a.product-link')->attr('href'),
                'title' => trim($node->filter('.product-title')->text()),
                'price' => $this->parsePrice($node->filter('.price')->text()),
                'sku' => $node->filter('[data-sku]')->attr('data-sku'),
            ];
        });
    }

    /**
     * Scrape a single product page (relative URL, joined with the base URL).
     *
     * @return array{title: string, description: ?string, price: float, images: array, in_stock: bool, sku: string}
     */
    public function scrapeProductDetail(string $productUrl): array
    {
        $html = $this->fetchWithRetry($this->baseUrl . $productUrl);
        $crawler = new Crawler($html);
        return [
            'title' => $crawler->filter('h1.product-name')->text(),
            'description' => $crawler->filter('.description')->html(),
            'price' => $this->parsePrice($crawler->filter('.current-price')->text()),
            'images' => $crawler->filter('.gallery img')->each(
                fn(Crawler $img) => $img->attr('src')
            ),
            'in_stock' => $crawler->filter('.in-stock')->count() > 0,
            'sku' => $crawler->filter('[itemprop="sku"]')->text(''),
        ];
    }

    /**
     * GET a URL with retry + jittered backoff. A new proxy is drawn from the
     * pool on every attempt so a dead proxy does not doom all retries.
     *
     * @throws \Exception The last transport error after all attempts fail.
     */
    private function fetchWithRetry(string $url, int $attempts = 3): string
    {
        $lastError = null;
        for ($i = 0; $i < $attempts; $i++) {
            try {
                // Rotate the proxy per attempt (the original picked one once).
                $options = $this->proxyPool ? ['proxy' => $this->randomProxy()] : [];
                $response = $this->client->get($url, $options);
                return (string) $response->getBody();
            } catch (\Exception $e) {
                $lastError = $e;
                if ($i < $attempts - 1) {
                    sleep(random_int(2, 5)); // jittered backoff between attempts
                }
            }
        }
        // Explicit throw keeps the string return type honest for static analysis.
        throw $lastError ?? new \RuntimeException("Failed to fetch {$url}");
    }

    /**
     * Parse a localized price string into a float.
     *
     * Handles "1 299,50", "1.299,00" and "1,299.00" alike by treating the
     * right-most separator as the decimal point — but only when it is
     * followed by at most two digits; otherwise it is thousands grouping.
     * (The original str_replace(',', '.') turned "1.299,00" into 1.299.)
     */
    private function parsePrice(string $text): float
    {
        $clean = preg_replace('/[^\d.,]/', '', $text);
        $lastComma = strrpos($clean, ',');
        $lastDot = strrpos($clean, '.');
        $sepPos = max($lastComma === false ? -1 : $lastComma, $lastDot === false ? -1 : $lastDot);

        if ($sepPos === -1 || strlen($clean) - $sepPos - 1 > 2) {
            // No separator, or 3+ trailing digits: all separators are grouping.
            return (float) preg_replace('/\D/', '', $clean);
        }

        $whole = preg_replace('/\D/', '', substr($clean, 0, $sepPos));
        $fraction = substr($clean, $sepPos + 1);
        return (float) ($whole . '.' . $fraction);
    }

    /** Pick a random browser User-Agent (the original called this but never defined it). */
    private function randomUserAgent(): string
    {
        return self::USER_AGENTS[array_rand(self::USER_AGENTS)];
    }

    /** Pick a random proxy from the pool (the original called this but never defined it). */
    private function randomProxy(): string
    {
        return $this->proxyPool[array_rand($this->proxyPool)];
    }
}
Background Job
// app/Jobs/ScrapeSupplierProducts.php
class ScrapeSupplierProducts implements ShouldQueue
{
    use Queueable;

    public int $tries = 2;
    public int $timeout = 300; // 5 minutes per category

    public function __construct(
        private int $supplierId,
        private string $categoryUrl
    ) {}

    /**
     * Scrape one category listing and fan out a detail job per product.
     *
     * Throttling is done via queue delays rather than usleep(): sleeping
     * inside handle() blocked this worker for up to ~1.5s per product,
     * which blows the 300s timeout on large categories and starves the queue.
     *
     * NOTE(review): $importer is injected but unused here — detail jobs
     * presumably perform the import; signature kept for compatibility.
     */
    public function handle(
        SupplierScraper $scraper,
        ProductImportService $importer
    ): void {
        $products = $scraper->scrapeProductList($this->categoryUrl);

        foreach ($products as $index => $productPreview) {
            // Stagger detail jobs ~1s apart to stay polite to the supplier.
            ScrapeSupplierProductDetail::dispatch(
                $this->supplierId,
                $productPreview['url']
            )
                ->onQueue('scraper-detail')
                ->delay(now()->addSeconds($index));
        }
    }
}
Deduplication
// app/Services/ProductImportService.php
class ProductImportService
{
    /**
     * Insert or update a scraped product, keyed by (supplier_id, supplier_sku).
     *
     * Dispatches PriceChangedNotification when an EXISTING product's price
     * moves by more than 5%. Newly created rows are skipped — they have no
     * previous price to compare against (the original also risked a division
     * by zero when the stored price was 0).
     *
     * @param array{sku: string, title: string, price: float, in_stock: bool, description: ?string, images: array} $data
     */
    public function upsert(int $supplierId, array $data): void
    {
        $product = SupplierProduct::updateOrCreate(
            [
                'supplier_id' => $supplierId,
                'supplier_sku' => $data['sku'],
            ],
            [
                'title' => $data['title'],
                'price' => $data['price'],
                'in_stock' => $data['in_stock'],
                'description' => $data['description'],
                // NOTE(review): if the model has an array/json cast on
                // `images`, this json_encode double-encodes — confirm casts.
                'images' => json_encode($data['images'], JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE),
                'scraped_at' => now(),
            ]
        );

        // Freshly inserted row: no baseline price, nothing to notify about.
        if ($product->wasRecentlyCreated || ! $product->wasChanged('price')) {
            return;
        }

        $previous = (float) $product->getOriginal('price');
        if ($previous <= 0.0) {
            return; // guard: avoid division by zero on a zero/unset old price
        }

        if (abs($product->price - $previous) / $previous > 0.05) {
            PriceChangedNotification::dispatch($product);
        }
    }
}
Schedule
// app/Console/Kernel.php
/**
 * Register the scraper schedule: a full nightly catalog crawl plus a
 * lightweight price/stock refresh every four hours. Both are guarded
 * against overlapping runs.
 */
protected function schedule(Schedule $schedule): void
{
    $command = 'scraper:supplier --supplier=1';

    // Complete catalog pass — once a day, off-peak.
    $schedule->command($command)
        ->dailyAt('03:00')
        ->withoutOverlapping();

    // Prices and availability only — every four hours.
    $schedule->command($command . ' --prices-only')
        ->everyFourHours()
        ->withoutOverlapping();
}
Basic supplier scraper (static HTML, 5-10 fields): 3-5 business days, including queue setup, scheduling, and basic monitoring.







