Knowledge Base Indexing (Confluence, Notion, SharePoint) for RAG
Corporate knowledge bases are the primary source of context for enterprise RAG systems. Key challenges: incremental synchronization (do not reindex everything on each run), access control management (a user should not receive answers derived from documents they are not permitted to access), and handling Confluence/Notion-specific markup.
Confluence Integration
from atlassian import Confluence
from datetime import datetime
class ConfluenceIndexer:
    """Incrementally index Confluence pages for a RAG pipeline.

    A per-space watermark (last successful sync time) is kept in a
    ``WatermarkStore`` so only pages modified since the previous run are
    reprocessed.
    """

    # Page size for the paginated Confluence REST calls.
    PAGE_LIMIT = 100

    def __init__(self, url: str, username: str, api_token: str):
        """Connect to Confluence Cloud using basic auth.

        Args:
            url: Base URL of the Confluence instance.
            username: Account email / username.
            api_token: Atlassian API token (sent as the basic-auth password).
        """
        self.confluence = Confluence(
            url=url,
            username=username,
            password=api_token,  # the API token takes the password slot
            cloud=True,  # True for Atlassian Cloud
        )
        self.watermark_store = WatermarkStore()

    def get_updated_pages(self, space_key: str) -> list[dict]:
        """Incremental load: return only pages updated since the watermark.

        Pages the whole space (a single API call returns at most
        ``PAGE_LIMIT`` results, so stopping after one call would silently
        drop pages in larger spaces), then filters client-side by the
        version timestamp.

        NOTE(review): assumes watermark values are timezone-aware
        datetimes; Confluence timestamps parse as UTC-aware, and comparing
        aware with naive datetimes raises TypeError — confirm against
        WatermarkStore.
        """
        last_indexed = self.watermark_store.get(f"confluence:{space_key}")
        pages: list[dict] = []
        start = 0
        while True:
            batch = self.confluence.get_all_pages_from_space(
                space=space_key,
                start=start,
                limit=self.PAGE_LIMIT,
                expand='body.storage,metadata,version,ancestors'
            )
            if not batch:
                break
            pages.extend(batch)
            if len(batch) < self.PAGE_LIMIT:
                break  # short batch => last page of results
            start += len(batch)
        if last_indexed:
            pages = [
                p for p in pages
                if self._parse_timestamp(p['version']['when']) > last_indexed
            ]
        return pages

    @staticmethod
    def _parse_timestamp(value: str) -> datetime:
        """Parse a Confluence ISO-8601 timestamp.

        Confluence Cloud emits a trailing ``'Z'`` (UTC), which
        ``datetime.fromisoformat`` rejects before Python 3.11 — normalize
        it to an explicit ``+00:00`` offset first.
        """
        return datetime.fromisoformat(value.replace('Z', '+00:00'))

    def parse_page(self, page: dict) -> dict:
        """Convert one Confluence page into a flat indexable record.

        Returns a dict with the page text rendered as Markdown plus
        metadata (title, url, space, ancestors, labels, author, ACLs).
        """
        from bs4 import BeautifulSoup
        from markdownify import markdownify

        # Confluence stores content in "storage format" (XHTML with
        # <ac:...> extension tags).
        html_content = page['body']['storage']['value']
        soup = BeautifulSoup(html_content, 'html.parser')

        # Process Confluence-specific macro tags.
        for macro in soup.find_all('ac:structured-macro'):
            macro_name = macro.get('ac:name', '')
            if macro_name == 'code':
                # Code blocks -> fenced markdown code blocks.
                body = macro.find('ac:plain-text-body')
                lang = macro.find('ac:parameter', {'ac:name': 'language'})
                code = body.get_text() if body else ''
                lang_str = lang.get_text() if lang else ''
                macro.replace_with(f'\n```{lang_str}\n{code}\n```\n')
            else:
                # Other macros (TOC, info panels, ...) carry no plain text
                # worth indexing — drop them.
                macro.decompose()

        text = markdownify(str(soup), heading_style="ATX")
        return {
            'id': page['id'],
            'title': page['title'],
            'text': text,
            'url': f"{self.confluence.url}/wiki{page['_links']['webui']}",
            'space': page['space']['key'],
            'ancestors': [a['title'] for a in page.get('ancestors', [])],
            'labels': [l['name'] for l in page.get('metadata', {}).get('labels', {}).get('results', [])],
            'last_modified': page['version']['when'],
            'author': page['version']['by']['displayName'],
            # Access control for permission-aware search; helper is defined
            # elsewhere in this module.
            'restrictions': self._get_page_restrictions(page['id'])
        }
Notion Integration
from notion_client import Client
class NotionIndexer:
    """Index Notion database pages for a RAG pipeline."""

    def __init__(self, token: str):
        """Create an authenticated Notion API client."""
        self.notion = Client(auth=token)

    def get_database_pages(self, database_id: str,
                           last_sync: datetime = None) -> list:
        """Return all pages of a database, newest-edit filtering optional.

        Args:
            database_id: Notion database to query.
            last_sync: If given, only pages edited strictly after this
                timestamp are returned (server-side filter).

        Follows Notion's cursor pagination — a single query returns at
        most 100 results, so stopping after one call would drop pages.
        """
        query_kwargs = {}
        if last_sync:
            query_kwargs = {
                "filter": {
                    "timestamp": "last_edited_time",
                    "last_edited_time": {"after": last_sync.isoformat()}
                }
            }
        pages: list = []
        while True:
            response = self.notion.databases.query(
                database_id=database_id,
                **query_kwargs
            )
            pages.extend(response.get('results', []))
            if not response.get('has_more'):
                return pages
            # Resume from where the previous batch stopped.
            query_kwargs['start_cursor'] = response['next_cursor']

    def extract_page_content(self, page_id: str) -> str:
        """Recursive block extraction: render a page's block tree as text."""
        blocks = self.notion.blocks.children.list(block_id=page_id)
        return self._blocks_to_text(blocks.get('results', []))

    def _blocks_to_text(self, blocks: list) -> str:
        """Convert Notion blocks to markdown-ish plain text.

        Handles paragraphs and headings 1-3; recurses into blocks that
        report nested children (toggles, list items, ...). Other block
        types contribute no text of their own.
        """
        text_parts = []
        for block in blocks:
            block_type = block['type']
            if block_type in ('paragraph', 'heading_1', 'heading_2', 'heading_3'):
                rich_text = block[block_type].get('rich_text', [])
                text = ''.join(rt['plain_text'] for rt in rich_text)
                if block_type.startswith('heading'):
                    # heading_N -> markdown '#' * N prefix.
                    level = int(block_type[-1])
                    text = '#' * level + ' ' + text
                text_parts.append(text)
            if block.get('has_children'):
                # Fetch and flatten nested blocks (the API does not inline
                # them in the parent response).
                children = self.notion.blocks.children.list(block_id=block['id'])
                text_parts.append(self._blocks_to_text(children.get('results', [])))
        # BUG FIX: the original accumulated text_parts but never returned,
        # so the method yielded None despite its `-> str` contract.
        return '\n'.join(text_parts)
Incremental indexing prevents full reprocessing, while permission-aware filtering ensures security.







