Developing a Product Description and Specifications Scraper Bot
Parsing product text content is about normalizing unstructured data into a unified schema. Each site stores characteristics differently: some in tables, others in JSON-LD, still others in microdata. The task is to extract the data regardless of how it is structured.
What Gets Extracted
- Descriptions: short and full, HTML-formatted or plain text
- Characteristics: key-value pairs from tables and lists
- Metadata: brand, country of origin, warranty
- Structured data: JSON-LD (Schema.org Product), microdata, OpenGraph
Multi-Layer Extraction Strategy
// app/Services/ContentScraper/ProductContentExtractor.php
use Symfony\Component\DomCrawler\Crawler;
class ProductContentExtractor
{
    /**
     * Extract product content from raw page HTML.
     *
     * Sources are tried in priority order:
     *  1. JSON-LD (most reliable, structured data)
     *  2. Microdata (itemprop attributes)
     *  3. CSS selectors from the supplier config
     *  4. Heuristic algorithm (fallback)
     *
     * @param string $html       Raw page HTML
     * @param string $url        Page URL, stored into the result as source_url
     * @param array  $siteConfig Per-supplier selector configuration
     *
     * @return array Product data plus 'specs' and 'source_url' keys
     */
    public function extract(string $html, string $url, array $siteConfig = []): array
    {
        $crawler = new Crawler($html);

        $data = $this->extractFromJsonLd($crawler)
            ?? $this->extractFromMicrodata($crawler)
            ?? $this->extractWithSelectors($crawler, $siteConfig)
            ?? $this->extractHeuristic($crawler);

        $data['specs'] = $this->extractSpecs($crawler, $siteConfig);
        $data['source_url'] = $url;

        return $data;
    }

    /**
     * Pull product fields from embedded JSON-LD (Schema.org Product).
     *
     * Handles @graph wrappers, @type given as a string or an array of
     * strings, and brand given either as a nested Brand object or a
     * bare string. Returns null when no Product entity is found.
     */
    private function extractFromJsonLd(Crawler $crawler): ?array
    {
        foreach ($crawler->filter('script[type="application/ld+json"]') as $script) {
            $json = json_decode($script->textContent, true);

            // json_decode can return a scalar for valid JSON ("foo", 42);
            // indexing into it would be a TypeError in PHP 8 — skip those.
            if (!is_array($json)) {
                continue;
            }

            // A single document may wrap several entities in @graph.
            $items = $json['@graph'] ?? [$json];

            foreach ($items as $item) {
                if (!is_array($item)) {
                    continue;
                }

                // @type is legally either a string or an array of types.
                $types = (array) ($item['@type'] ?? []);
                if (!array_intersect($types, ['Product', 'IndividualProduct'])) {
                    continue;
                }

                // brand is either {"@type":"Brand","name":"X"} or a bare string.
                $brand = $item['brand'] ?? null;
                if (is_array($brand)) {
                    $brand = $brand['name'] ?? null;
                }

                return [
                    'name' => $item['name'] ?? null,
                    'description' => strip_tags((string) ($item['description'] ?? '')),
                    'brand' => $brand,
                    'sku' => $item['sku'] ?? $item['mpn'] ?? null,
                    'gtin' => $item['gtin13'] ?? $item['gtin'] ?? null,
                ];
            }
        }

        return null;
    }

    /**
     * Pull product fields from Schema.org microdata (itemprop attributes).
     *
     * Returns null when no element with a Product itemtype is present.
     */
    private function extractFromMicrodata(Crawler $crawler): ?array
    {
        $product = $crawler->filter('[itemtype*="schema.org/Product"]');
        if (!$product->count()) {
            return null;
        }

        // Fetch the first element carrying the given itemprop, once per prop
        // (the original filtered the same selector twice per lookup).
        $get = function (string $prop) use ($product): ?string {
            $node = $product->filter(sprintf('[itemprop="%s"]', $prop))->first();

            return $node->count() ? trim($node->text('')) : null;
        };

        return [
            'name' => $get('name'),
            'description' => $get('description'),
            'brand' => $get('brand'),
            'sku' => $get('sku'),
        ];
    }
}
Extracting Specifications
/**
 * Collect key-value specification pairs from the page.
 *
 * Strategy 1: two-column spec/characteristics tables.
 * Strategy 2 (fallback, only when tables yield nothing): dl/dt/dd lists.
 *
 * @param Crawler $crawler Parsed product page
 * @param array   $config  Supplier config (reserved for custom selectors)
 *
 * @return array<string, string> spec name => spec value
 */
private function extractSpecs(Crawler $crawler, array $config): array
{
    $specs = [];

    // Strategy 1: tables with at least two cells per row.
    $crawler->filter('table.specs tr, table.characteristics tr, .attributes-table tr')->each(
        function (Crawler $row) use (&$specs) {
            $cells = $row->filter('td, th');
            if ($cells->count() >= 2) {
                $key = trim($cells->first()->text());
                $val = trim($cells->eq(1)->text());
                // Skip empty cells and header rows where both cells repeat the same text.
                if ($key && $val && $key !== $val) {
                    $specs[$key] = $val;
                }
            }
        }
    );

    // Strategy 2: definition lists.
    if (empty($specs)) {
        $crawler->filter('dl')->each(function (Crawler $dl) use (&$specs) {
            $keys = $dl->filter('dt')->each(fn(Crawler $n) => trim($n->text()));
            $vals = $dl->filter('dd')->each(fn(Crawler $n) => trim($n->text()));

            // array_combine() throws a ValueError in PHP 8 when the dt/dd
            // counts differ (common on malformed markup) — pair only as
            // many entries as both sides actually have.
            $count = min(count($keys), count($vals));
            for ($i = 0; $i < $count; $i++) {
                if ($keys[$i] !== '' && $vals[$i] !== '') {
                    $specs[$keys[$i]] = $vals[$i];
                }
            }
        });
    }

    return $specs;
}
Data Normalization
Characteristics from different suppliers are named differently. A normalizer brings them to a single schema:
// app/Services/ContentScraper/SpecsNormalizer.php
class SpecsNormalizer
{
    /**
     * Canonical key => raw spec names seen across suppliers.
     *
     * @var array<string, list<string>>
     */
    private array $synonyms = [
        'weight'   => ['Weight', 'Mass', 'Net Weight', 'Item Weight'],
        'color'    => ['Color', 'Colour', 'Shade', 'Hue'],
        'brand'    => ['Brand', 'Manufacturer', 'Make', 'Producer'],
        'country'  => ['Country', 'Country of Origin', 'Made In'],
        'material' => ['Material', 'Composition', 'Fabric', 'Substance'],
    ];

    /**
     * Map raw supplier specs onto the unified schema.
     *
     * Keys found in the synonym dictionary get their canonical name;
     * unknown keys are slugified. Values are normalized per key.
     *
     * @param array<string, string> $rawSpecs raw key => value pairs
     *
     * @return array<string, mixed> normalized key => normalized value
     */
    public function normalize(array $rawSpecs): array
    {
        $normalized = [];
        foreach ($rawSpecs as $rawKey => $value) {
            $rawKey = (string) $rawKey;
            $normalKey = $this->findNormalKey($rawKey) ?? $this->slug($rawKey);
            $normalized[$normalKey] = $this->normalizeValue($normalKey, (string) $value);
        }

        return $normalized;
    }

    /**
     * Case-insensitive lookup of a raw key in the synonym dictionary.
     * (This method was referenced but missing — calling normalize()
     * previously failed with an undefined-method Error.)
     */
    private function findNormalKey(string $rawKey): ?string
    {
        $needle = mb_strtolower(trim($rawKey));
        foreach ($this->synonyms as $normalKey => $variants) {
            foreach ($variants as $variant) {
                if (mb_strtolower($variant) === $needle) {
                    return $normalKey;
                }
            }
        }

        return null;
    }

    /**
     * Fallback key for specs not in the dictionary:
     * lowercase, non-alphanumeric runs collapsed to single underscores.
     */
    private function slug(string $rawKey): string
    {
        $slug = mb_strtolower(trim($rawKey));
        $slug = preg_replace('/[^\p{L}\p{N}]+/u', '_', $slug) ?? $slug;

        return trim($slug, '_');
    }

    /**
     * Apply per-key value normalization; unknown keys are just trimmed.
     */
    private function normalizeValue(string $key, string $value): mixed
    {
        return match ($key) {
            'weight' => $this->normalizeWeight($value),
            default  => trim($value),
        };
    }

    /**
     * Convert a weight string to grams: "1.5 kg" → 1500.0, "500 g" → 500.0.
     * Returns null when no recognizable unit is found.
     */
    private function normalizeWeight(string $value): ?float
    {
        // Kilograms checked first so "kg" is never consumed by the gram
        // pattern; "k" alone is accepted as in the original ("1.5 k").
        if (preg_match('/(\d+(?:[.,]\d+)?)\s*kg?\b/ui', $value, $m)) {
            return (float) str_replace(',', '.', $m[1]) * 1000;
        }
        // Word boundary prevents matching the leading "g" of longer words.
        if (preg_match('/(\d+(?:[.,]\d+)?)\s*(?:gr|g)\b/ui', $value, $m)) {
            return (float) str_replace(',', '.', $m[1]);
        }

        return null;
    }
}
Development Timeline
Description + specs scraper for one site with normalization: 3-5 business days. Universal extractor with 5+ sources and synonym dictionary: 8-12 days.







