Automatic Broken Link Checking
Broken links (404s, 500s, timeouts) harm SEO and degrade the user experience. An automatic check runs on a schedule and reports any issues it finds before users discover them.
Implementing the Link Checker
# broken_link_checker.py
import asyncio
import aiohttp
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from collections import defaultdict
class BrokenLinkChecker:
    """Crawl a site starting from *base_url* and collect broken links.

    A link is recorded as broken when it returns HTTP >= 400, times out,
    or raises a connection error. Only same-domain HTML pages are parsed
    for further links; external URLs are requested once and checked only.
    """

    def __init__(self, base_url: str, concurrency: int = 20):
        """
        :param base_url: starting page; only pages on its domain are crawled.
        :param concurrency: maximum number of simultaneous HTTP requests.
        """
        self.base_url = base_url
        self.domain = urlparse(base_url).netloc
        self.semaphore = asyncio.Semaphore(concurrency)
        self.visited: set[str] = set()   # URLs already requested or queued
        self.broken: list[dict] = []     # {'url', 'status', 'referrer'} records

    async def check(self) -> list[dict]:
        """Run the crawl and return the accumulated broken-link records."""
        async with aiohttp.ClientSession(
            timeout=aiohttp.ClientTimeout(total=15),
            headers={'User-Agent': 'LinkChecker/1.0'},
        ) as session:
            await self.crawl(session, self.base_url, referrer='root')
        return self.broken

    async def crawl(self, session: aiohttp.ClientSession, url: str, referrer: str):
        """Fetch *url*, record it if broken, and recurse into same-domain HTML.

        :param referrer: the page on which this URL was found (for the report).
        """
        if url in self.visited:
            return
        self.visited.add(url)

        html = None
        # Hold the semaphore only around the HTTP request itself. The original
        # code held it across the recursive gather() of child crawls, so every
        # ancestor kept a permit while waiting on its descendants — once the
        # chain/fan-out of waiting ancestors exhausted all permits, the crawl
        # deadlocked.
        async with self.semaphore:
            try:
                async with session.get(url, allow_redirects=True) as resp:
                    if resp.status >= 400:
                        self.broken.append({'url': url, 'status': resp.status, 'referrer': referrer})
                        return
                    # Parse only HTML pages of the same domain
                    if urlparse(url).netloc == self.domain and 'text/html' in resp.headers.get('Content-Type', ''):
                        html = await resp.text()
            except asyncio.TimeoutError:
                self.broken.append({'url': url, 'status': 'timeout', 'referrer': referrer})
                return
            except Exception as e:
                self.broken.append({'url': url, 'status': str(e), 'referrer': referrer})
                return

        # Recurse with the semaphore released so children can acquire permits.
        if html is not None:
            links = self.extract_links(url, html)
            tasks = [self.crawl(session, link, url) for link in links if link not in self.visited]
            await asyncio.gather(*tasks, return_exceptions=True)

    def extract_links(self, base: str, html: str) -> list[str]:
        """Return deduplicated absolute http(s) URLs referenced by the page.

        <a>/<link> reference resources via ``href``; <img>/<script> via
        ``src``. The original passed ``href=True`` to ``find_all``, which
        filtered on the href attribute and silently dropped every img/script
        tag, so their ``src`` URLs were never checked.
        """
        soup = BeautifulSoup(html, 'lxml')
        links: set[str] = set()
        for tag in soup.find_all(['a', 'link', 'img', 'script']):
            href = tag.get('href') or tag.get('src')
            if not href:
                continue
            absolute = urljoin(base, href)
            if urlparse(absolute).scheme in ('http', 'https'):
                links.add(absolute.split('#')[0])  # strip fragment: same resource
        return list(links)
Scheduler and Notifications
# scheduler.py (Celery Beat)
import asyncio
import os

import requests
from celery import Celery
from celery.schedules import crontab

from broken_link_checker import BrokenLinkChecker
app = Celery('tasks', broker='redis://localhost:6379/0')


@app.task
def check_broken_links():
    """Crawl the site, notify Slack about broken links, and persist a report.

    Returns a small status dict stored in Celery's result backend.
    """
    checker = BrokenLinkChecker('https://example.com', concurrency=15)
    # The checker is asyncio-based while Celery workers are synchronous,
    # so drive the event loop to completion inside the task.
    broken = asyncio.run(checker.check())
    if not broken:
        return {'status': 'ok', 'checked': len(checker.visited)}

    # Send report to Slack
    message = f"🔗 Found {len(broken)} broken links:\n"
    for item in broken[:10]:  # First 10
        message += f"• `{item['status']}` {item['url']}\n ← {item['referrer']}\n"
    if len(broken) > 10:
        message += f"...and {len(broken) - 10} more. Full report in CSV.\n"
    # Bug fix: `os` has no attribute `env` — the environment mapping is
    # os.environ (raises KeyError if the webhook is not configured).
    requests.post(os.environ['SLACK_WEBHOOK'], json={'text': message})

    # Save to DB for history
    # NOTE(review): BrokenLinkReport and timezone are not defined in this
    # snippet — presumably Django (`from django.utils import timezone` plus
    # the project's model import); confirm against the actual project.
    BrokenLinkReport.objects.create(
        checked_at=timezone.now(),
        total_links=len(checker.visited),
        broken_count=len(broken),
        details=broken,
    )
    return {'broken': len(broken)}
# Schedule: every day at 6:00
app.conf.beat_schedule = {
    'daily-link-check': {
        # Fully qualified task name: <module>.<function>
        'task': 'scheduler.check_broken_links',
        # crontab comes from celery.schedules (imported at the top of the file)
        'schedule': crontab(hour=6, minute=0),
    },
}
Timeframe
Async crawler with Celery scheduling and Slack notifications: 1–2 business days.







