Email Correspondence Indexing for RAG

We design and deploy artificial intelligence systems, from prototype to production-ready solution. Our team combines expertise in machine learning, data engineering and MLOps to make AI work in real business, not just in the lab.
Showing 1 of 1 services · All 1566 services
Email Correspondence Indexing for RAG
Medium
from 1 week to 3 months
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823
  • image_logo-aider_0.jpg
    AIDER company logo development
    762
  • image_crm_chasseurs_493_0.webp
    CRM development for Chasseurs
    848

Email Indexing for RAG

Using email as a knowledge source for RAG is a non-trivial task: messages contain quoted text from earlier messages, signatures, automatic notifications, and spam, all of which must be filtered out. At the same time, email often contains unique expert knowledge that is unavailable in official documentation.

Email Server Connection

import email
import email.message
import imaplib
import re
from email.header import decode_header

class EmailIndexer:
    def __init__(self, imap_host: str, username: str, password: str):
        self.mail = imaplib.IMAP4_SSL(imap_host)
        self.mail.login(username, password)

    def fetch_emails(self, folder: str = "INBOX",
                     since_date: str = None,
                     max_count: int = 1000) -> list[dict]:
        self.mail.select(folder)

        # Email search
        search_criteria = []
        if since_date:
            search_criteria.append(f'SINCE {since_date}')

        criteria = ' '.join(search_criteria) if search_criteria else 'ALL'
        _, message_ids = self.mail.search(None, criteria)

        emails = []
        ids = message_ids[0].split()[-max_count:]  # Last N emails

        for msg_id in ids:
            _, msg_data = self.mail.fetch(msg_id, '(RFC822)')
            msg = email.message_from_bytes(msg_data[0][1])
            parsed = self._parse_email(msg)
            if parsed:
                emails.append(parsed)

        return emails

    def _parse_email(self, msg: email.message.Message) -> dict | None:
        # Header decoding
        subject = self._decode_header(msg.get('Subject', ''))
        sender = msg.get('From', '')
        date = msg.get('Date', '')

        # Email body extraction
        body = self._extract_body(msg)
        if not body or len(body.split()) < 20:
            return None  # Email too short

        # Cleanup from quotes and signatures
        clean_body = self._clean_email_body(body)

        return {
            'subject': subject,
            'sender': sender,
            'date': date,
            'body': clean_body,
            'thread_id': msg.get('Message-ID', ''),
            'in_reply_to': msg.get('In-Reply-To', ''),
        }

    def _clean_email_body(self, body: str) -> str:
        """Removal of quoted text, signatures, auto-responses"""
        lines = body.split('\n')
        clean_lines = []

        for line in lines:
            # Skip quoted lines (start with >)
            if line.strip().startswith('>'):
                continue
            # Skip standard quote separators
            if re.match(r'^On .* wrote:$', line.strip()):
                break  # Everything after is a quote
            if line.strip().startswith('From:') and len(clean_lines) > 10:
                break
            clean_lines.append(line)

        text = '\n'.join(clean_lines).strip()

        # Remove typical signatures
        signature_markers = [
            'Best regards,', 'Best,', 'Thanks,', 'Regards,',
            'С уважением,', 'Спасибо,'
        ]
        for marker in signature_markers:
            if marker in text:
                idx = text.rfind(marker)
                # If marker at end - it's a signature
                if len(text) - idx < 200:
                    text = text[:idx].strip()
                    break

        return text

Email Relevance Filtering

class EmailRelevanceFilter:
    """Decide whether a parsed email record is worth indexing."""

    # Substrings of the (lower-cased) sender that mark automated mailers.
    IGNORE_SENDERS = [
        'noreply@', 'no-reply@', 'donotreply@',
        'newsletter@', 'notifications@', 'alerts@'
    ]

    # Case-insensitive regexes for subjects of system-generated messages.
    IGNORE_SUBJECT_PATTERNS = [
        r'^(Re: )?Automatic reply',
        r'^Out of (Office|office)',
        r'^Undelivered Mail Returned',
        r'^\[SPAM\]',
        r'^Meeting (invitation|canceled|accepted)',
    ]

    def is_relevant(self, email_dict: dict) -> tuple[bool, str]:
        """Return ``(keep, reason)`` for a record with sender/subject/body keys.

        Checks run in order: automated sender, system-notification subject,
        then a minimum body length of 30 whitespace-separated words.
        """
        sender_lc = email_dict.get('sender', '').lower()
        subject_line = email_dict.get('subject', '')

        auto_sender = next(
            (fragment for fragment in self.IGNORE_SENDERS if fragment in sender_lc),
            None,
        )
        if auto_sender is not None:
            return False, f"Auto-sender: {auto_sender}"

        system_pattern = next(
            (
                candidate
                for candidate in self.IGNORE_SUBJECT_PATTERNS
                if re.search(candidate, subject_line, re.IGNORECASE)
            ),
            None,
        )
        if system_pattern is not None:
            return False, f"System notification: {system_pattern}"

        word_count = len(email_dict.get('body', '').split())
        if word_count < 30:
            return False, "Body too short"

        return True, "relevant"

Thread Reconstruction

def _email_sort_key(message: dict) -> tuple[bool, float]:
    """Return a chronological sort key; records with unparsable dates sort last."""
    from datetime import timezone
    from email.utils import parsedate_to_datetime

    try:
        parsed = parsedate_to_datetime(str(message.get('date') or ''))
    except (TypeError, ValueError):
        return (True, 0.0)
    if parsed.tzinfo is None:
        # Naive and aware datetimes cannot be compared; treat naive as UTC.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return (False, parsed.timestamp())


def _thread_root_id(message: dict, by_message_id: dict) -> str:
    """Follow ``in_reply_to`` links upwards and return the root message id."""
    visited = set()
    current = message
    while True:
        parent_id = (current.get('in_reply_to') or '').strip()
        if not parent_id:
            # No parent: this message starts the thread.
            return (current.get('thread_id') or '').strip()
        if parent_id in visited or parent_id not in by_message_id:
            # Parent is outside the fetched set (or the links form a cycle):
            # its id is the best root we can name.
            return parent_id
        visited.add(parent_id)
        current = by_message_id[parent_id]


def reconstruct_threads(emails: list[dict]) -> list[dict]:
    """Group email records into threads and render one document per thread.

    Each record is expected to carry ``subject``, ``sender``, ``date``,
    ``body``, ``thread_id`` (the message's own Message-ID) and
    ``in_reply_to``. Replies are attached to the root of their reply chain,
    so nested replies end up in the same thread as the first message.
    Messages inside a thread are ordered by their parsed ``date`` header.
    Records with neither id share one thread keyed by the empty string.

    Returns:
        A list of dicts with ``thread_id``, ``subject`` (of the earliest
        message), ``text``, ``participants`` (unique senders in order of
        first appearance) and ``date_range`` (first and last date strings).
    """
    by_message_id: dict = {}
    for message in emails:
        message_id = (message.get('thread_id') or '').strip()
        if message_id:
            by_message_id.setdefault(message_id, message)

    threads: dict = {}
    for message in emails:
        root_id = _thread_root_id(message, by_message_id)
        threads.setdefault(root_id, []).append(message)

    thread_docs = []
    for thread_id, thread_emails in threads.items():
        sorted_emails = sorted(thread_emails, key=_email_sort_key)
        thread_text = '\n\n---\n\n'.join(
            f"**From:** {e['sender']}\n**Date:** {e['date']}\n\n{e['body']}"
            for e in sorted_emails
        )
        thread_docs.append({
            'thread_id': thread_id,
            'subject': sorted_emails[0]['subject'],
            'text': thread_text,
            'participants': list(dict.fromkeys(e['sender'] for e in sorted_emails)),
            'date_range': (sorted_emails[0]['date'], sorted_emails[-1]['date']),
        })

    return thread_docs

When indexing email correspondence, it's important to comply with GDPR and corporate policies: index only work-related emails, exclude personal messages, and provide users with the ability to delete their data upon request.