Setting Up Elasticsearch for Web Application Search
Elasticsearch is a distributed search engine based on Apache Lucene. It's chosen when standard ILIKE '%query%' in PostgreSQL no longer suffices: full-text search with relevance, faceted filtering, autocomplete, geo-search — all are native capabilities of ES.
Installing Elasticsearch 8.x
# Add repository
wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg
echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" > /etc/apt/sources.list.d/elastic-8.x.list
apt update && apt install -y elasticsearch
# Save superuser password from installation output
systemctl enable elasticsearch && systemctl start elasticsearch
Minimal config for single-node dev:
# /etc/elasticsearch/elasticsearch.yml
cluster.name: myapp-search
node.name: node-1
path.data: /var/lib/elasticsearch
path.logs: /var/log/elasticsearch
network.host: 127.0.0.1
discovery.type: single-node
xpack.security.enabled: true
xpack.security.http.ssl.enabled: false # for dev; enable in prod
JVM Heap
# /etc/elasticsearch/jvm.options.d/heap.options
# No more than 50% of RAM, no more than 32GB (compressed OOP threshold)
-Xms4g
-Xmx4g
Index mapping
Mapping is the index schema. Mappings of existing fields are immutable — a wrong mapping cannot be changed in place and can only be fixed by creating a new index and reindexing:
PUT /products
{
"settings": {
"number_of_shards": 2,
"number_of_replicas": 1,
"analysis": {
"analyzer": {
"russian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "russian_stop", "russian_stemmer"]
},
"autocomplete_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase", "edge_ngram_filter"]
},
"autocomplete_search": {
"type": "custom",
"tokenizer": "standard",
"filter": ["lowercase"]
}
},
"filter": {
"russian_stop": { "type": "stop", "stopwords": "_russian_" },
"russian_stemmer": { "type": "stemmer", "language": "russian" },
"edge_ngram_filter": { "type": "edge_ngram", "min_gram": 2, "max_gram": 20 }
}
}
},
"mappings": {
"properties": {
"id": { "type": "keyword" },
"name": {
"type": "text",
"analyzer": "russian_analyzer",
"fields": {
"autocomplete": { "type": "text", "analyzer": "autocomplete_analyzer", "search_analyzer": "autocomplete_search" },
"keyword": { "type": "keyword" }
}
},
"description": { "type": "text", "analyzer": "russian_analyzer" },
"category": { "type": "keyword" },
"brand": { "type": "keyword" },
"price": { "type": "scaled_float", "scaling_factor": 100 },
"in_stock": { "type": "boolean" },
"attributes": { "type": "object", "dynamic": true },
"location": { "type": "geo_point" },
"created_at": { "type": "date" }
}
}
}
Search query with facets
POST /products/_search
{
"query": {
"bool": {
"must": [
{
"multi_match": {
"query": "беспроводные наушники",
"fields": ["name^3", "description", "name.autocomplete^2"],
"type": "best_fields",
"fuzziness": "AUTO"
}
}
],
"filter": [
{ "term": { "in_stock": true } },
{ "range": { "price": { "gte": 1000, "lte": 10000 } } },
{ "terms": { "category": ["audio", "headphones"] } }
]
}
},
"aggs": {
"categories": { "terms": { "field": "category", "size": 20 } },
"brands": { "terms": { "field": "brand", "size": 30 } },
"price_ranges": {
"range": {
"field": "price",
"ranges": [
{ "to": 1000 },
{ "from": 1000, "to": 5000 },
{ "from": 5000, "to": 15000 },
{ "from": 15000 }
]
}
}
},
"highlight": {
"fields": { "name": {}, "description": { "fragment_size": 150 } }
},
"from": 0,
"size": 24,
"sort": [{ "_score": "desc" }, { "created_at": "desc" }]
}
Data sync from PostgreSQL
Change-data-capture via Debezium + Kafka is the production-grade solution. To begin with, something simpler is enough:
// sync/product-indexer.ts
import { Client } from '@elastic/elasticsearch'
import { Pool } from 'pg'
// Elasticsearch client. Security is enabled (xpack.security.enabled: true),
// so we authenticate as the built-in `elastic` user with the password
// captured from the installation output.
const es = new Client({
  node: 'http://localhost:9200',
  auth: { username: 'elastic', password: process.env.ES_PASSWORD! },
})

// PostgreSQL pool — the source of truth the search index is built from.
const pg = new Pool({ connectionString: process.env.DATABASE_URL })
/**
 * Index a single product into Elasticsearch, or remove it from the index
 * when it no longer exists (or has been soft-deleted) in PostgreSQL.
 *
 * @param id - product primary key (indexed under the same id in ES)
 */
export async function indexProduct(id: string): Promise<void> {
  const { rows } = await pg.query(`
    SELECT p.*, c.name AS category_name,
           json_agg(json_build_object('key', a.key, 'value', a.value)) AS attributes
    FROM products p
    LEFT JOIN categories c ON c.id = p.category_id
    LEFT JOIN product_attributes a ON a.product_id = p.id
    WHERE p.id = $1 AND p.deleted_at IS NULL -- keep soft-delete semantics consistent with reindexAll
    GROUP BY p.id, c.name
  `, [id])

  if (!rows.length) {
    // Product is gone (or soft-deleted): drop it from the index.
    // ignore: [404] — deleting a document that was never indexed must not throw.
    await es.delete({ index: 'products', id }, { ignore: [404] })
    return
  }

  const p = rows[0]

  // With no attribute rows the LEFT JOIN still produces one aggregated entry:
  // json_agg -> [{ "key": null, "value": null }]. Filter those out so we don't
  // index a bogus "null" attribute key.
  const attributes = Object.fromEntries(
    ((p.attributes ?? []) as Array<{ key: string | null; value: unknown }>)
      .filter(a => a.key != null)
      .map(a => [a.key, a.value])
  )

  await es.index({
    index: 'products',
    id: p.id,
    document: {
      id: p.id,
      name: p.name,
      description: p.description,
      category: p.category_name,
      price: p.price,
      in_stock: p.stock_quantity > 0,
      attributes,
      created_at: p.created_at
    }
  })
}
/**
 * Full reindex: pushes every non-deleted product through indexProduct
 * in bounded batches so we don't open unbounded concurrent ES/PG requests.
 *
 * Note: the original called an undefined `chunk()` helper; batching is now
 * done inline with slice().
 */
export async function reindexAll(): Promise<void> {
  const BATCH_SIZE = 100
  const { rows } = await pg.query('SELECT id FROM products WHERE deleted_at IS NULL')
  const ids: string[] = rows.map((r: { id: string }) => r.id)
  for (let i = 0; i < ids.length; i += BATCH_SIZE) {
    const batch = ids.slice(i, i + BATCH_SIZE)
    await Promise.all(batch.map(indexProduct))
    console.log(`Indexed ${batch.length} products`)
  }
}
Cluster monitoring
# Cluster health
curl -s -u elastic:$ES_PASSWORD http://localhost:9200/_cluster/health?pretty
# Index statistics
curl -s -u elastic:$ES_PASSWORD "http://localhost:9200/products/_stats?pretty" | jq '.indices.products.total'
# Slow-query logging (security is enabled, so every request needs credentials)
curl -s -u elastic:$ES_PASSWORD -X PUT "http://localhost:9200/products/_settings" -H 'Content-Type: application/json' -d '{
  "index.search.slowlog.threshold.query.warn": "2s",
  "index.search.slowlog.threshold.query.info": "500ms"
}'
Timelines
Elasticsearch setup, index creation with custom analyzers and integration with application: 3–5 days. Autocomplete, faceted search and synchronization from PostgreSQL setup: 3–5 more days. Cluster of three nodes with Kibana and monitoring: 1–2 weeks.







