Knowledge Base Indexing (Confluence, Notion, SharePoint) for RAG

We design and deploy artificial intelligence systems, from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering, and MLOps so that AI delivers value not just in the lab, but in real business settings.
Showing 1 of 1 services (1566 services total)
Knowledge Base Indexing (Confluence, Notion, SharePoint) for RAG
Medium
from 1 week to 3 months
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    823
  • image_logo-aider_0.jpg
    AIDER company logo development
    762
  • image_crm_chasseurs_493_0.webp
    CRM development for Chasseurs
    848

Knowledge Base Indexing (Confluence, Notion, SharePoint) for RAG

Corporate knowledge bases are the primary source of context for enterprise RAG systems. The key challenges are: incremental synchronization (avoid reindexing everything on each run), access control enforcement (a user must not receive answers derived from documents they cannot access), and handling Confluence/Notion-specific markup.

Confluence Integration

from atlassian import Confluence
from datetime import datetime

class ConfluenceIndexer:
    """Incrementally index Confluence spaces for a RAG pipeline.

    Fetches only pages modified since the last stored watermark, converts
    Confluence storage-format XHTML to markdown, and attaches metadata
    (ancestors, labels, restrictions) for permission-aware retrieval.
    """

    def __init__(self, url: str, username: str, api_token: str):
        # Atlassian Cloud authenticates with an API token sent as the password.
        self.confluence = Confluence(
            url=url,
            username=username,
            password=api_token,
            cloud=True  # True for Atlassian Cloud
        )
        # WatermarkStore (defined elsewhere in the project) persists the
        # last-indexed timestamp per space for incremental sync.
        self.watermark_store = WatermarkStore()

    @staticmethod
    def _parse_timestamp(value: str) -> datetime:
        """Parse a Confluence ``version.when`` timestamp into a datetime.

        Confluence Cloud returns ISO-8601 with a trailing ``Z`` (e.g.
        ``2024-03-01T12:00:00.000Z``), which ``datetime.fromisoformat``
        rejects before Python 3.11 — normalize it to an explicit UTC offset
        so parsing works on all supported versions.
        """
        return datetime.fromisoformat(value.replace('Z', '+00:00'))

    def get_updated_pages(self, space_key: str, batch_size: int = 100) -> list[dict]:
        """Incremental load: return only pages updated since the last sync.

        Paginates through the entire space. (The original implementation
        issued a single request with ``limit=100``, silently dropping every
        page beyond the first batch in larger spaces.)

        Args:
            space_key: Confluence space key to scan.
            batch_size: Pages fetched per API request.
        """
        last_indexed = self.watermark_store.get(f"confluence:{space_key}")

        pages: list[dict] = []
        start = 0
        while True:
            batch = self.confluence.get_all_pages_from_space(
                space=space_key,
                start=start,
                limit=batch_size,
                expand='body.storage,metadata,version,ancestors'
            )
            pages.extend(batch)
            if len(batch) < batch_size:
                break  # last (possibly partial) batch reached
            start += batch_size

        if last_indexed:
            # NOTE(review): _parse_timestamp returns a timezone-aware datetime;
            # comparing it with a naive value raises TypeError. Confirm that
            # WatermarkStore stores aware datetimes.
            pages = [
                p for p in pages
                if self._parse_timestamp(p['version']['when']) > last_indexed
            ]

        return pages

    def parse_page(self, page: dict) -> dict:
        """Convert one Confluence page into a flat record for chunking/embedding.

        Returns a dict with the markdown text, canonical URL, hierarchy
        (ancestors), labels, version metadata, and access restrictions.
        """
        from bs4 import BeautifulSoup
        from markdownify import markdownify

        # Confluence stores content in storage format (XHTML)
        html_content = page['body']['storage']['value']
        soup = BeautifulSoup(html_content, 'html.parser')

        # Process Confluence-specific <ac:structured-macro> tags: preserve
        # code macros as fenced markdown blocks, drop every other macro
        # (TOC, panels, etc.) since they carry no indexable text.
        for macro in soup.find_all('ac:structured-macro'):
            macro_name = macro.get('ac:name', '')
            if macro_name == 'code':
                # Code blocks → markdown code blocks
                body = macro.find('ac:plain-text-body')
                lang = macro.find('ac:parameter', {'ac:name': 'language'})
                code = body.get_text() if body else ''
                lang_str = lang.get_text() if lang else ''
                macro.replace_with(f'\n```{lang_str}\n{code}\n```\n')
            else:
                macro.decompose()

        text = markdownify(str(soup), heading_style="ATX")

        return {
            'id': page['id'],
            'title': page['title'],
            'text': text,
            'url': f"{self.confluence.url}/wiki{page['_links']['webui']}",
            'space': page['space']['key'],
            'ancestors': [a['title'] for a in page.get('ancestors', [])],
            'labels': [l['name'] for l in page.get('metadata', {}).get('labels', {}).get('results', [])],
            'last_modified': page['version']['when'],
            'author': page['version']['by']['displayName'],
            # Access control for permission-aware search
            'restrictions': self._get_page_restrictions(page['id'])
        }

Notion Integration

from notion_client import Client

class NotionIndexer:
    """Incrementally index Notion databases and pages for a RAG pipeline."""

    def __init__(self, token: str):
        self.notion = Client(auth=token)

    def get_database_pages(self, database_id: str,
                            last_sync: datetime = None) -> list:
        """Query a database, optionally limited to pages edited after last_sync.

        Follows Notion's cursor pagination via ``has_more``/``next_cursor``.
        (The original implementation read only the first response page —
        at most 100 results.)
        """
        filter_params = {}
        if last_sync:
            filter_params = {
                "filter": {
                    "timestamp": "last_edited_time",
                    "last_edited_time": {"after": last_sync.isoformat()}
                }
            }

        pages: list = []
        cursor = None
        while True:
            response = self.notion.databases.query(
                database_id=database_id,
                **({"start_cursor": cursor} if cursor else {}),
                **filter_params
            )
            pages.extend(response.get('results', []))
            if not response.get('has_more'):
                return pages
            cursor = response.get('next_cursor')

    def extract_page_content(self, page_id: str) -> str:
        """Extract text from a page's top-level blocks.

        NOTE(review): despite the original "recursive" docstring, children of
        nested blocks are not fetched here — only top-level blocks are read.
        """
        blocks = self.notion.blocks.children.list(block_id=page_id)
        return self._blocks_to_text(blocks.get('results', []))

    def _blocks_to_text(self, blocks: list) -> str:
        """Render paragraph/heading blocks to markdown-style text, one per line.

        Blocks of any other type (dividers, images, ...) are skipped.
        """
        text_parts = []
        for block in blocks:
            block_type = block['type']
            if block_type in ['paragraph', 'heading_1', 'heading_2', 'heading_3']:
                rich_text = block[block_type].get('rich_text', [])
                text = ''.join(rt['plain_text'] for rt in rich_text)
                if block_type.startswith('heading'):
                    level = int(block_type[-1])  # heading_N → N '#' markers
                    text = '#' * level + ' ' + text
                text_parts.append(text)
        # BUG FIX: the original fell off the end of the function and returned
        # None, discarding text_parts despite the -> str annotation.
        return '\n'.join(text_parts)

Incremental indexing prevents full reprocessing, while permission-aware filtering ensures security.