Email Indexing for RAG
Using email as a knowledge source for RAG is non-trivial: messages contain quoted text from earlier messages, signatures, automatic notifications, and spam, all of which must be filtered out. However, email often contains unique expert knowledge that is unavailable in official documentation.
Email Server Connection
import email
import imaplib
import re
from email.errors import HeaderParseError
from email.header import decode_header
from email.message import Message
class EmailIndexer:
def __init__(self, imap_host: str, username: str, password: str):
self.mail = imaplib.IMAP4_SSL(imap_host)
self.mail.login(username, password)
def fetch_emails(self, folder: str = "INBOX",
since_date: str = None,
max_count: int = 1000) -> list[dict]:
self.mail.select(folder)
# Email search
search_criteria = []
if since_date:
search_criteria.append(f'SINCE {since_date}')
criteria = ' '.join(search_criteria) if search_criteria else 'ALL'
_, message_ids = self.mail.search(None, criteria)
emails = []
ids = message_ids[0].split()[-max_count:] # Last N emails
for msg_id in ids:
_, msg_data = self.mail.fetch(msg_id, '(RFC822)')
msg = email.message_from_bytes(msg_data[0][1])
parsed = self._parse_email(msg)
if parsed:
emails.append(parsed)
return emails
def _parse_email(self, msg: email.message.Message) -> dict | None:
# Header decoding
subject = self._decode_header(msg.get('Subject', ''))
sender = msg.get('From', '')
date = msg.get('Date', '')
# Email body extraction
body = self._extract_body(msg)
if not body or len(body.split()) < 20:
return None # Email too short
# Cleanup from quotes and signatures
clean_body = self._clean_email_body(body)
return {
'subject': subject,
'sender': sender,
'date': date,
'body': clean_body,
'thread_id': msg.get('Message-ID', ''),
'in_reply_to': msg.get('In-Reply-To', ''),
}
def _clean_email_body(self, body: str) -> str:
"""Removal of quoted text, signatures, auto-responses"""
lines = body.split('\n')
clean_lines = []
for line in lines:
# Skip quoted lines (start with >)
if line.strip().startswith('>'):
continue
# Skip standard quote separators
if re.match(r'^On .* wrote:$', line.strip()):
break # Everything after is a quote
if line.strip().startswith('From:') and len(clean_lines) > 10:
break
clean_lines.append(line)
text = '\n'.join(clean_lines).strip()
# Remove typical signatures
signature_markers = [
'Best regards,', 'Best,', 'Thanks,', 'Regards,',
'С уважением,', 'Спасибо,'
]
for marker in signature_markers:
if marker in text:
idx = text.rfind(marker)
# If marker at end - it's a signature
if len(text) - idx < 200:
text = text[:idx].strip()
break
return text
Email Relevance Filtering
class EmailRelevanceFilter:
    """Decide whether a parsed email record is worth indexing.

    A record is rejected when its sender looks automated, its subject looks
    like a system notification, or its body is too short.
    """

    # Substrings of the (lower-cased) sender that identify automated mail.
    IGNORE_SENDERS = [
        'noreply@', 'no-reply@', 'donotreply@',
        'newsletter@', 'notifications@', 'alerts@'
    ]
    # Case-insensitive regexes anchored at the start of the subject.
    IGNORE_SUBJECT_PATTERNS = [
        r'^(Re: )?Automatic reply',
        r'^Out of (Office|office)',
        r'^Undelivered Mail Returned',
        r'^\[SPAM\]',
        r'^Meeting (invitation|canceled|accepted)',
    ]
    # Bodies with fewer words than this are rejected.
    MIN_BODY_WORDS = 30

    def is_relevant(self, email_dict: dict) -> tuple[bool, str]:
        """Return ``(verdict, reason)`` for *email_dict*.

        Args:
            email_dict: Record with optional ``sender``, ``subject`` and
                ``body`` entries. Missing or ``None`` values are treated as
                empty strings.

        Returns:
            ``(True, "relevant")`` when the record passes every check,
            otherwise ``(False, <reason>)`` naming the first check that
            failed.
        """
        # ``or ''`` guards against keys that are present but set to None;
        # str() tolerates non-string header objects.
        sender = str(email_dict.get('sender') or '').lower()
        # Leading whitespace would defeat the ^-anchored patterns.
        subject = str(email_dict.get('subject') or '').strip()
        body = str(email_dict.get('body') or '')

        # Automatic emails
        for ignore in self.IGNORE_SENDERS:
            if ignore in sender:
                return False, f"Auto-sender: {ignore}"

        # System notifications
        for pattern in self.IGNORE_SUBJECT_PATTERNS:
            if re.search(pattern, subject, re.IGNORECASE):
                return False, f"System notification: {pattern}"

        if len(body.split()) < self.MIN_BODY_WORDS:
            return False, "Body too short"

        return True, "relevant"
Thread Reconstruction
def reconstruct_threads(emails: list[dict]) -> list[dict]:
    """Group email records into conversation threads for better context.

    Each record is expected to carry ``thread_id`` (its own Message-ID),
    ``in_reply_to``, ``sender``, ``date``, ``subject`` and ``body``.
    Replies are attached to the *root* of their conversation by following
    the ``in_reply_to`` chain through the records that are present, so a
    reply to a reply lands in the same thread as the original message.
    Records with neither identifier become single-message threads.

    Args:
        emails: Parsed email records.

    Returns:
        One dict per thread with the keys ``thread_id`` (root Message-ID, or
        ``''`` when unknown), ``subject`` (of the earliest message),
        ``text`` (messages joined chronologically), ``participants``
        (senders in order of first appearance) and ``date_range``
        (first and last ``date`` strings). Threads appear in the order
        their first message occurs in *emails*.
    """
    # Local import: keeps the function self-contained.
    from email.utils import parsedate_to_datetime

    def _first_id(message: dict, key: str) -> str:
        # Header values may be None, padded, or hold several IDs.
        tokens = str(message.get(key) or '').split()
        return tokens[0] if tokens else ''

    def _timestamp(message: dict) -> float:
        # RFC 2822 dates start with the weekday name, so comparing the raw
        # strings does not give chronological order.
        try:
            return parsedate_to_datetime(message.get('date') or '').timestamp()
        except (TypeError, ValueError, OverflowError, OSError):
            return float('-inf')  # undated messages sort first

    # Message-ID -> Message-ID of the message it replies to.
    parent_of: dict[str, str] = {}
    for message in emails:
        message_id = _first_id(message, 'thread_id')
        parent_id = _first_id(message, 'in_reply_to')
        if message_id and parent_id and message_id != parent_id:
            parent_of.setdefault(message_id, parent_id)

    def _root_of(message_id: str) -> str:
        visited = set()  # guards against malformed, cyclic reply chains
        while message_id in parent_of and message_id not in visited:
            visited.add(message_id)
            message_id = parent_of[message_id]
        return message_id

    # Key is (root_id, None) for identified threads and ('', position) for
    # messages without any identifier, so those never merge with each other.
    threads: dict[tuple, list[dict]] = {}
    for position, message in enumerate(emails):
        anchor = _first_id(message, 'in_reply_to') or _first_id(message, 'thread_id')
        key = (_root_of(anchor), None) if anchor else ('', position)
        threads.setdefault(key, []).append(message)

    thread_docs = []
    for (thread_id, _), thread_emails in threads.items():
        ordered = sorted(thread_emails, key=_timestamp)
        thread_text = '\n\n---\n\n'.join(
            f"**From:** {e['sender']}\n**Date:** {e['date']}\n\n{e['body']}"
            for e in ordered
        )
        thread_docs.append({
            'thread_id': thread_id,
            'subject': ordered[0]['subject'],
            'text': thread_text,
            # dict.fromkeys de-duplicates while keeping a stable order.
            'participants': list(dict.fromkeys(e['sender'] for e in ordered)),
            'date_range': (ordered[0]['date'], ordered[-1]['date']),
        })
    return thread_docs
When indexing email correspondence, it's important to comply with GDPR and corporate policies: index only work-related emails, exclude personal messages, and provide users with the ability to delete their data upon request.







