Protecting API from Scraping and Bot Detection
API scraping is the systematic, automated collection of data at a far higher frequency than normal human interaction. Without protection, a competitor can download your entire product catalog in hours, brute-force passwords, or harvest your contact database. The task is to distinguish bots from humans without harming legitimate users.
Protection Layers
Client → WAF (IP reputation) → Rate Limiting → Bot Detection → API Logic
↓
Fingerprint + Behavioral Analysis + CAPTCHA
Each layer filters part of traffic. Ideal protection is combination of several methods, none of which is perfect alone.
Bot Signals and Their Weight
| Signal | Weight | Description |
|---|---|---|
| Missing User-Agent / curl / python-requests | +40 | Typical automated clients |
| No Accept-Language / Accept-Encoding | +20 | A browser always sends these |
| Requests every N ms perfectly | +35 | Human cannot be so precise |
| Same URL pattern (sequential traversal) | +30 | /items/1, /items/2, /items/3... |
| No Referer on navigation | +15 | Browser usually sends it |
| Many requests from same IP range | +25 | Distributed bot |
| Atypical TLS fingerprint (JA3) | +30 | Node.js/Python TLS differs from browser |
Behavior-Based Detector
import time
import statistics
from collections import defaultdict, deque
class BotDetector:
    """Score incoming requests 0-100 for bot-likeness.

    Combines four signal families: header anomalies, inter-request timing,
    URL access patterns, and the JA3 TLS fingerprint. Per-IP state lives in
    Redis inside a rolling window.

    NOTE(review): ``_analyze_url_pattern``, ``_is_suspicious_ja3`` and
    ``_get_action`` are referenced here but defined outside this excerpt.
    """

    # User-Agent substrings typical of scripted HTTP clients
    # (matched case-insensitively, first hit wins).
    _AUTOMATED_CLIENTS = ('python-requests', 'curl', 'wget', 'Go-http-client',
                          'Java/', 'okhttp', 'axios', 'node-fetch')

    def __init__(self, redis_client):
        self.r = redis_client
        self.window = 300  # seconds of per-IP history kept (5-minute window)

    def analyze_request(self, request) -> dict:
        """Returns score (0-100) and suspicion reasons"""
        total = 0
        flags = []

        # --- Signal 1: header anomalies --------------------------------
        hdrs = request.headers
        user_agent = hdrs.get('User-Agent', '')
        lowered = user_agent.lower()
        hit = next((marker for marker in self._AUTOMATED_CLIENTS
                    if marker.lower() in lowered), None)
        if hit is not None:
            total += 40
            flags.append(f'bot_useragent:{hit}')
        if not user_agent:
            total += 40
            flags.append('no_useragent')
        if not hdrs.get('Accept-Language'):
            total += 20
            flags.append('no_accept_language')
        if not hdrs.get('Accept-Encoding'):
            total += 15
            flags.append('no_accept_encoding')

        # --- Signal 2: machine-regular request timing ------------------
        client_ip = request.remote_addr
        timing_score = self._analyze_timing(client_ip)
        if timing_score > 0:
            total += timing_score
            flags.append(f'suspicious_timing:{timing_score}')

        # --- Signal 3: sequential URL traversal ------------------------
        pattern_score = self._analyze_url_pattern(client_ip, request.path)
        if pattern_score > 0:
            total += pattern_score
            flags.append(f'url_pattern:{pattern_score}')

        # --- Signal 4: TLS fingerprint forwarded by the proxy ----------
        ja3 = hdrs.get('X-JA3-Fingerprint')
        if ja3 and self._is_suspicious_ja3(ja3):
            total += 30
            flags.append(f'suspicious_ja3:{ja3[:16]}')

        # 'score' is capped for display; classification and action use
        # the raw accumulated total.
        return {
            'score': min(total, 100),
            'is_bot': total >= 60,
            'reasons': flags,
            'action': self._get_action(total)
        }

    def _analyze_timing(self, ip: str) -> int:
        """Score how machine-like the request cadence from *ip* is.

        Records the current timestamp in a capped Redis list, then looks at
        the coefficient of variation (stdev/mean) of the gaps between the
        recent requests: near-zero variation at high frequency is a cadence
        a human in a browser cannot produce.
        """
        key = f"timing:{ip}"
        now = time.time()

        # Keep only the 50 most recent timestamps; expire with the window.
        self.r.lpush(key, now)
        self.r.ltrim(key, 0, 49)
        self.r.expire(key, self.window)

        stamps = sorted(float(t) for t in self.r.lrange(key, 0, -1))
        if len(stamps) < 5:
            return 0  # too little history to judge

        gaps = [later - earlier for earlier, later in zip(stamps, stamps[1:])]
        if not gaps:
            return 0

        mean_gap = statistics.mean(gaps)
        spread = statistics.stdev(gaps) if len(gaps) > 1 else 0
        # Coefficient of variation: ~0 means metronome-like regularity.
        cv = spread / mean_gap if mean_gap > 0 else 0

        if cv < 0.05 and mean_gap < 2.0:  # extremely regular, fast requests
            return 35
        if cv < 0.15 and mean_gap < 1.0:  # fairly regular, very fast
            return 25
        return 0







