Training AI Agents on Employee and Corporate Knowledge Base Data
A corporate AI agent trained on internal data understands company context: documentation standards, historical solutions, processes, and industry terminology. Unlike a general-purpose LLM, such an agent gives answers specific to your organization rather than generalized recommendations from the internet.
Training Approaches
RAG (Retrieval-Augmented Generation) — the most common approach: index documents in a vector database and retrieve relevant fragments at query time. Requires no model retraining (the indexing side is sketched after this list).
Fine-tuning — further training the model on corporate dialogs and documents. The model learns the company's style, terminology, and typical response formats.
Hybrid approach — fine-tune for style + RAG for up-to-date knowledge. Optimal for production.
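Since the hybrid agent shown later in this article only queries an already-populated collection, here is a minimal sketch of the indexing half of RAG. It assumes a local Qdrant instance and the same bge-m3 embedder used below; the corporate_docs collection name matches the later code, while the chunking (one point per document) is a simplification.

from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("BAAI/bge-m3")  # 1024-dim dense embeddings
client = QdrantClient(host="qdrant-server")

client.recreate_collection(
    collection_name="corporate_docs",
    vectors_config=models.VectorParams(size=1024, distance=models.Distance.COSINE),
)

def index_documents(docs: list[dict]) -> None:
    """Embed and upsert collected documents.

    `docs` is the output of the collectors shown in the next section
    (dicts with title/content/source fields stored as payload).
    """
    points = []
    for i, doc in enumerate(docs):
        vector = embed_model.encode(doc["content"]).tolist()
        points.append(models.PointStruct(id=i, vector=vector, payload=doc))
    client.upsert(collection_name="corporate_docs", points=points)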
Collecting and Preparing Corporate Data
# confluence_client, gmail_client and jira_client are assumed to be
# pre-configured async clients for the respective corporate APIs, and
# html_to_text a helper that strips Confluence HTML markup.

class CorporateDataCollector:
    """Collect data from corporate sources"""

    async def collect_from_confluence(self, space_keys: list[str]) -> list[dict]:
        """Confluence pages"""
        docs = []
        for space in space_keys:
            pages = await confluence_client.get_all_pages(space)
            for page in pages:
                content = await confluence_client.get_page_content(page["id"])
                docs.append({
                    "source": "confluence",
                    "id": page["id"],
                    "title": page["title"],
                    "content": html_to_text(content),
                    "updated_at": page["version"]["when"],
                    "labels": page.get("labels", []),
                    "space": space,
                })
        return docs

    async def collect_from_email_threads(
        self,
        email_accounts: list[str],
        filter_subjects: list[str] | None = None,
        anonymize_pii: bool = True,
    ) -> list[dict]:
        """Email correspondence as training data for dialogs"""
        threads = []
        for account in email_accounts:
            emails = await gmail_client.get_threads(account, filter_subjects)
            for thread in emails:
                # Single-message threads carry no dialog signal
                if len(thread["messages"]) >= 2:
                    # Transform correspondence into dialog format
                    dialog = self.format_as_dialog(thread["messages"])
                    if anonymize_pii:
                        dialog = await self.anonymize_pii(dialog)
                    threads.append(dialog)
        return threads

    async def collect_from_tickets(
        self,
        jira_project: str,
        status: str = "Done",
        limit: int = 5000,
    ) -> list[dict]:
        """Resolved tickets as Q&A pairs"""
        tickets = await jira_client.get_issues(
            jql=f"project={jira_project} AND status={status}",
            fields=["summary", "description", "comments", "resolution"],
            limit=limit,
        )
        qa_pairs = []
        for ticket in tickets:
            if ticket.get("comments"):
                # `or ''` guards against a present-but-None description
                description = (ticket.get("description") or "")[:500]
                qa_pairs.append({
                    "question": f"{ticket['summary']}\n{description}",
                    "answer": self.extract_resolution(ticket),
                    "source": "jira",
                    "ticket_id": ticket["id"],
                })
        return qa_pairs
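The format_as_dialog, anonymize_pii, and extract_resolution helpers referenced above are not shown. Here is one possible minimal anonymize_pii, sketched as a method of CorporateDataCollector using a regex pass; production pipelines typically use a dedicated PII tool such as Microsoft Presidio instead.

import re

EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+")
PHONE_RE = re.compile(r"\+?\d[\d\s().-]{7,}\d")

async def anonymize_pii(self, dialog: list[dict]) -> list[dict]:
    """Replace emails and phone numbers with placeholders.

    Assumes format_as_dialog produced a list of
    {"role": ..., "content": ...} turns.
    """
    for turn in dialog:
        text = EMAIL_RE.sub("[EMAIL]", turn["content"])
        turn["content"] = PHONE_RE.sub("[PHONE]", text)
    return dialog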
Preparing Data for Fine-tuning
import json

class FinetuningDatasetBuilder:
    async def build_instruction_dataset(
        self,
        raw_docs: list[dict],
        qa_pairs: list[dict],
        target_format: str = "openai",  # "openai", "alpaca", "sharegpt"
    ) -> list[dict]:
        dataset = []
        # Generate Q&A from documents via LLM
        for doc in raw_docs:
            qa_from_doc = await self.generate_qa_from_document(doc["content"])
            for qa in qa_from_doc:
                if target_format == "openai":
                    dataset.append({
                        "messages": [
                            {"role": "system", "content": "You are a corporate assistant. Answer employee questions."},
                            {"role": "user", "content": qa["question"]},
                            {"role": "assistant", "content": qa["answer"]},
                        ]
                    })
        # Tickets already come as ready-made Q&A pairs
        for qa in qa_pairs:
            if target_format == "openai":
                dataset.append({
                    "messages": [
                        {"role": "system", "content": "You are a technical support assistant."},
                        {"role": "user", "content": qa["question"]},
                        {"role": "assistant", "content": qa["answer"]},
                    ]
                })
        # Deduplication and quality filtering
        dataset = self.deduplicate(dataset)
        dataset = self.filter_quality(dataset, min_answer_length=50)
        return dataset

    async def generate_qa_from_document(self, document_text: str) -> list[dict]:
        """Generate Q&A pairs from a document"""
        # openai_client is assumed to be a pre-configured AsyncOpenAI instance
        response = await openai_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": f"""Create 5-10 questions and answers from the following document.
Questions should read like real employees would ask them.
Answers should be complete and accurate.

Document:
{document_text[:3000]}

Return JSON: [{{"question": "...", "answer": "..."}}]""",
            }],
        )
        # Raw parse; a robust version would strip markdown fences and retry
        return json.loads(response.choices[0].message.content)

    def filter_quality(self, dataset: list[dict], min_answer_length: int) -> list[dict]:
        """Filter out low-quality examples"""
        filtered = []
        for item in dataset:
            messages = item.get("messages", [])
            assistant_msg = next((m for m in messages if m["role"] == "assistant"), None)
            if assistant_msg and len(assistant_msg["content"]) >= min_answer_length:
                filtered.append(item)
        return filtered
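The deduplicate method is referenced above but not defined. One possible exact-match version hashes each example's messages; near-duplicate detection (e.g. MinHash) is out of scope here. A JSONL writer is also sketched, since that is the layout OpenAI's fine-tuning API expects for the messages format built above.

import hashlib
import json

def deduplicate(self, dataset: list[dict]) -> list[dict]:
    """Drop examples whose messages are byte-identical to one already seen."""
    seen: set[str] = set()
    unique = []
    for item in dataset:
        key_text = json.dumps(item["messages"], sort_keys=True)
        digest = hashlib.sha256(key_text.encode()).hexdigest()
        if digest not in seen:
            seen.add(digest)
            unique.append(item)
    return unique

def write_jsonl(dataset: list[dict], path: str) -> None:
    """Serialize the dataset, one JSON object per line."""
    with open(path, "w", encoding="utf-8") as f:
        for item in dataset:
            f.write(json.dumps(item, ensure_ascii=False) + "\n")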
Hybrid Architecture: Fine-tune + RAG
from openai import AsyncOpenAI
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer

class HybridCorporateAgent:
    """Combines a fine-tuned model (company style) with RAG (up-to-date knowledge)"""

    def __init__(self):
        # Fine-tuned model knows company style and terminology; served via
        # vLLM's OpenAI-compatible endpoint (the key is unused but required)
        self.finetuned_client = AsyncOpenAI(
            base_url="http://vllm-server:8000/v1",
            api_key="EMPTY",
        )
        self.finetuned_model = "company-assistant-ft-v2"
        # RAG for up-to-date documents
        self.embed_model = SentenceTransformer("BAAI/bge-m3")
        self.vector_db = QdrantClient(host="qdrant-server")

    async def answer(self, question: str, user_context: dict | None = None) -> dict:
        # Step 1: Find relevant documents
        query_embedding = self.embed_model.encode(question).tolist()
        relevant_docs = self.vector_db.search(
            collection_name="corporate_docs",
            query_vector=query_embedding,
            limit=5,
            score_threshold=0.6,
            query_filter=self.build_access_filter(user_context),  # Access rights
        )
        # Step 2: Form the context
        context = "\n\n".join(
            f"[{doc.payload['title']}]: {doc.payload['content']}"
            for doc in relevant_docs
        )
        # Step 3: Answer with the fine-tuned model plus RAG context
        response = await self.finetuned_client.chat.completions.create(
            model=self.finetuned_model,
            messages=[{
                "role": "system",
                "content": f"You are a corporate assistant. Use the documents as the source of truth.\n\nDocuments:\n{context}",
            }, {
                "role": "user",
                "content": question,
            }],
            temperature=0.1,
        )
        return {
            "answer": response.choices[0].message.content,
            "sources": [{"title": d.payload["title"], "score": d.score} for d in relevant_docs],
        }

    def build_access_filter(self, user_context: dict | None) -> models.Filter | None:
        """Filter by access rights — an employee sees only permitted documents"""
        if not user_context:
            return None
        department = user_context.get("department", "all")
        clearance = user_context.get("clearance", "public")
        return models.Filter(
            must=[
                models.FieldCondition(key="access_level", match=models.MatchAny(any=[clearance, "public"])),
                models.FieldCondition(key="departments", match=models.MatchAny(any=[department, "all"])),
            ]
        )
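A short usage example; the question is illustrative, and the user_context keys match what build_access_filter expects.

import asyncio

async def main():
    agent = HybridCorporateAgent()
    result = await agent.answer(
        "How do we request a VPN certificate?",
        user_context={"department": "engineering", "clearance": "internal"},
    )
    print(result["answer"])
    for src in result["sources"]:
        print(f"  source: {src['title']} (score {src['score']:.2f})")

asyncio.run(main())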
Practical Case Study: IT Company, 300 Employees
Training data: 8,000 Confluence pages, 12,000 resolved Jira tickets, 5 years of email correspondence (anonymized).
Process:
- Collection and cleanup: 3 weeks (most of the time went to quality filtering)
- Generating synthetic Q&A from Confluence: 45,000 pairs
- Fine-tuning GPT-4o-mini: 60,000-example dataset, 3 epochs (job-submission sketch after this list)
- Indexing all documents in Qdrant for RAG
- Deploying the hybrid agent to production
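The fine-tuning step maps to OpenAI's managed fine-tuning API. A minimal sketch, assuming the JSONL export shown earlier; the file name and suffix are illustrative.

from openai import OpenAI

client = OpenAI()

# Upload the JSONL dataset and start a fine-tuning job;
# the epoch count mirrors the setup described above.
training_file = client.files.create(
    file=open("corporate_dataset.jsonl", "rb"),
    purpose="fine-tune",
)
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-4o-mini-2024-07-18",
    hyperparameters={"n_epochs": 3},
    suffix="company-assistant",
)
print(job.id, job.status)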
Results:
- Answer accuracy on corporate processes (evaluated on 500 questions): 91% vs. 67% with base GPT-4o
- Correct use of corporate terminology: 97% vs. 43%
- Questions of the form "how do we do X here": only the fine-tuned model answers correctly
- Support ticket volume: down 34%
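The case study does not detail how the 500-question evaluation was run. One way such a measurement could be scripted is an LLM-as-judge loop over a held-out set; the eval_set structure ({"question", "reference"}) and the judge prompt are assumptions.

from openai import AsyncOpenAI

judge = AsyncOpenAI()

async def grade(question: str, reference: str, answer: str) -> bool:
    # Ask a strong model to compare the candidate against the reference
    resp = await judge.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": f"Question: {question}\nReference answer: {reference}\n"
                       f"Candidate answer: {answer}\n"
                       "Reply CORRECT or INCORRECT only.",
        }],
        temperature=0,
    )
    return resp.choices[0].message.content.strip().startswith("CORRECT")

async def evaluate(agent, eval_set: list[dict]) -> float:
    results = []
    for item in eval_set:
        answer = (await agent.answer(item["question"]))["answer"]
        results.append(await grade(item["question"], item["reference"], answer))
    return sum(results) / len(results)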
Timeline
- Collecting and cleaning corporate data: 2–4 weeks
- Generating synthetic Q&A: 1–2 weeks
- Fine-tuning (GPU time): 2–5 days
- RAG indexing and tuning: 1–2 weeks
- Testing and calibration: 2 weeks
- Total: 8–13 weeks