Document Indexing for RAG (PDF, DOCX, HTML, Markdown)
Document indexing is the first and critically important stage of a RAG system. Search quality depends on parsing quality: lost tables, merged text from columns, incorrectly recognized headings — all degrade answer relevance.
Parsing Different Formats
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
@dataclass
class ParsedDocument:
    """Normalized result of parsing one source file, ready for chunking/indexing."""

    text: str                          # full extracted text (markdown-flavoured)
    metadata: dict                     # e.g. {"source": ..., "pages": ..., "title": ...}
    source_format: str                 # "pdf" | "docx" | "html" | "markdown"
    # Optional[int], not int: only paginated formats (PDF) know a page count.
    page_count: Optional[int] = None
class DocumentParser:
    """Parse PDF, DOCX, HTML and Markdown files into ParsedDocument objects.

    Tables are converted to markdown so their content stays searchable, and
    DOCX headings are rendered as markdown '#' headings to preserve hierarchy.
    """

    def parse(self, file_path: str) -> "ParsedDocument":
        """Dispatch to a format-specific parser based on the file extension.

        Raises:
            ValueError: if the extension is not one of the supported formats.
        """
        ext = Path(file_path).suffix.lower()
        if ext == '.pdf':
            return self._parse_pdf(file_path)
        if ext in ('.docx', '.doc'):
            return self._parse_docx(file_path)
        if ext in ('.html', '.htm'):
            return self._parse_html(file_path)
        if ext in ('.md', '.markdown'):
            return self._parse_markdown(file_path)
        raise ValueError(f"Unsupported format: {ext}")

    def _parse_pdf(self, path: str) -> "ParsedDocument":
        """Extract text page by page; append each page's tables as markdown."""
        # For complex PDFs (tables, multi-column layouts) pdfplumber is more
        # reliable than plain-text extractors.
        import pdfplumber
        with pdfplumber.open(path) as pdf:
            pages_text = []
            for page in pdf.pages:
                text = page.extract_text() or ""
                # Preserve tables as markdown so cell content remains indexable.
                for table in page.extract_tables():
                    text += f"\n\n{self._table_to_markdown(table)}\n\n"
                pages_text.append(text)
            full_text = "\n\n---PAGE BREAK---\n\n".join(pages_text)
            return ParsedDocument(
                text=full_text,
                metadata={"source": path, "pages": len(pdf.pages)},
                source_format="pdf",
                page_count=len(pdf.pages),
            )

    def _parse_docx(self, path: str) -> "ParsedDocument":
        """Extract paragraphs and tables in document order, headings as markdown."""
        from docx import Document
        from docx.text.paragraph import Paragraph
        doc = Document(path)
        elements = []
        # Walk the raw XML body so paragraphs and tables keep their original
        # order (doc.paragraphs / doc.tables lose the interleaving).
        for element in doc.element.body:
            # Tags are namespaced ('{...}p'); match '}p' to avoid false hits.
            if element.tag.endswith('}p'):
                # Wrap the raw element: lxml etree elements have no .style or
                # .text_content() — those only exist on the Paragraph proxy.
                para = Paragraph(element, doc)
                style = para.style.name if para.style is not None else ''
                text = para.text
                if style.startswith('Heading'):
                    suffix = style.split()[-1]
                    level = int(suffix) if suffix.isdigit() else 1
                    # Clamp to markdown's valid heading depth of 1..6.
                    elements.append('#' * max(1, min(level, 6)) + ' ' + text)
                elif text.strip():
                    elements.append(text)
            elif element.tag.endswith('}tbl'):
                elements.append(self._extract_table_from_docx(element))
        return ParsedDocument(
            text='\n\n'.join(elements),
            metadata={"source": path},
            source_format="docx",
        )

    def _parse_html(self, path: str) -> "ParsedDocument":
        """Strip boilerplate tags and convert the remaining HTML to markdown."""
        from bs4 import BeautifulSoup
        with open(path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        # Remove non-content elements so they don't pollute the index.
        for tag in soup(['script', 'style', 'nav', 'footer', 'header']):
            tag.decompose()
        # markdownify keeps headings/lists/links as markdown structure.
        from markdownify import markdownify
        text = markdownify(str(soup), heading_style="ATX")
        return ParsedDocument(
            text=text,
            metadata={"source": path, "title": soup.title.string if soup.title else ""},
            source_format="html",
        )

    def _parse_markdown(self, path: str) -> "ParsedDocument":
        """Markdown is already plain structured text — index it verbatim."""
        text = Path(path).read_text(encoding='utf-8')
        return ParsedDocument(
            text=text,
            metadata={"source": path},
            source_format="markdown",
        )

    def _table_to_markdown(self, table) -> str:
        """Render a table (list of rows, each a list of cells) as a markdown table.

        The first row is used as the header; None cells become empty strings.
        Returns "" for an empty table.
        """
        if not table:
            return ""
        rows = [
            [('' if cell is None else str(cell)).strip() for cell in row]
            for row in table
        ]
        header, *body = rows
        lines = [
            '| ' + ' | '.join(header) + ' |',
            '| ' + ' | '.join(['---'] * len(header)) + ' |',
        ]
        lines.extend('| ' + ' | '.join(row) + ' |' for row in body)
        return '\n'.join(lines)

    def _extract_table_from_docx(self, tbl_element) -> str:
        """Convert a raw DOCX <w:tbl> XML element into a markdown table string."""
        from docx.table import Table
        # NOTE(review): Table's parent is only used for part lookups; None is
        # sufficient for plain-text cell access — confirm against python-docx.
        table = Table(tbl_element, None)
        rows = [[cell.text for cell in row.cells] for row in table.rows]
        return self._table_to_markdown(rows)
This approach produces structured, searchable document indexes while preserving the document hierarchy and table information.







