Add live research capabilities with innovation detection

Research Tools:
- SearchTool: Multiple backends (Tavily, Serper, Brave, DuckDuckGo)
- WikipediaTool: Wikipedia lookup
- AcademicSearchTool: CrossRef, Semantic Scholar
- ResearchOrchestrator: Comprehensive multi-source research

ResearchAgent:
- NOT just fact-checking - actively discovers NEW information
- Identifies trends beyond training data cutoff
- Generates innovations from cross-referencing sources
- Deep research with subtopics

VerifiedFactChecker:
- Live claim verification against web sources
- Confidence scoring
- Citation needed detection

Dependencies added: tavily, wikipedia, arxiv, duckduckgo-search
2026-03-13 05:03:52 +00:00
parent 6766e93c3d
commit 8cb29889cc
5 changed files with 862 additions and 0 deletions
@@ -22,6 +22,11 @@ from opus_orchestrator.agents.nonfiction import (
    NonfictionWriterAgent,
    ResearcherAgent,
 )
+from opus_orchestrator.agents.research import (
+    ResearchAgent,
+    VerifiedFactChecker,
+    create_research_agent,
+)
 from opus_orchestrator.config import OpusConfig, get_config
 from opus_orchestrator.schemas import (
    BookIntent,
@@ -83,6 +88,10 @@ __all__ = [
    "NonfictionWriterAgent",
    "FactCheckerAgent",
    "NonfictionEditorAgent",
+    # Research Agent (NEW!)
+    "ResearchAgent",
+    "VerifiedFactChecker",
+    "create_research_agent",
    # LangGraph
    "OpusGraph",
    "OpusGraphState",
@@ -0,0 +1,339 @@
+"""Research Agent for Opus Orchestrator.
+
+Enhanced nonfiction agent with live research capabilities.
+"""
+
+import os
+from typing import Any, Optional
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from opus_orchestrator.agents.base import BaseAgent, AgentResponse
+from opus_orchestrator.utils.research import (
+    ResearchOrchestrator,
+    create_research_orchestrator,
+    SearchTool,
+    WikipediaTool,
+    AcademicSearchTool,
+)
+
+
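+# System prompt for ResearchAgent: describes the researcher persona, the
+# four-step research process, and the Markdown layout expected in the reply.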
+RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access
+
+You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.
+
+## Your Capabilities
+
+1. **Web Search** - Search the current web for latest information
+2. **Wikipedia** - Access encyclopedic knowledge
+3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
+4. **Innovation Detection** - Identify gaps and new ideas beyond training data
+
+## Your Mission
+
+NOT just verify facts — **DISCOVER new information, trends, and innovations**.
+
+- Find what's NEW since your training cutoff
+- Identify research gaps and opportunities  
+- Connect disparate ideas into novel insights
+- Go beyond what you "know" to what you can FIND
+
+## Research Process
+
+1. **Explore** - Broad search on topic
+2. **Deep Dive** - Specific searches on subtopics
+3. **Cross-Reference** - Find connections between sources
+4. **Innovate** - Generate original insights beyond training data
+
+## Output Format
+
+Provide your research in this structure:
+
+```
+## Findings (What you discovered)
+- [New information 1]
+- [New information 2]
+- [Latest developments]
+
+## Sources (Where you found it)
+- [URL 1]: [Title]
+- [URL 2]: [Title]
+
+## Innovations (Original insights beyond training data)
+- [Novel connection 1]
+- [Novel connection 2]
+
+## Research Gaps (What's not well-covered)
+- [Gap 1]
+- [Gap 2]
+```
+
+## Remember
+
+You're not just fact-checking — you're RESEARCHING. Actively seek new information, 
+challenge assumptions, and generate original ideas. This keeps the content fresh 
+and prevents "AI slop" from repetitive training data patterns.
+"""
+
+
+class ResearchAgent(BaseAgent):
+    """Research agent that gathers live sources and asks the LLM to synthesize them.
+
+    ``execute`` runs a quick or deep search through a ``ResearchOrchestrator``,
+    renders the gathered sources as Markdown, and sends that summary to the LLM
+    together with the original query.
+    """
+
+    def __init__(
+        self,
+        config=None,
+        search_provider: str = "tavily",
+        use_wikipedia: bool = True,
+        use_academic: bool = True,
+    ):
+        """Initialize research agent with tools.
+
+        Args:
+            config: Agent configuration, forwarded to ``BaseAgent``
+            search_provider: Search provider (tavily, serper, brave, duckduckgo)
+            use_wikipedia: Include Wikipedia search
+            use_academic: Include academic search
+        """
+        # Orchestrator used by execute(); it owns its own tool instances.
+        self.research = create_research_orchestrator(
+            search_provider=search_provider,
+            use_wikipedia=use_wikipedia,
+            use_academic=use_academic,
+        )
+
+        # Stand-alone tool handles kept as public attributes for direct use.
+        self.search_tool = SearchTool(provider=search_provider)
+        self.wikipedia = WikipediaTool() if use_wikipedia else None
+        self.academic = AcademicSearchTool() if use_academic else None
+
+        super().__init__(
+            role="Research Agent",
+            description="Live web research with innovation detection",
+            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
+            config=config,
+        )
+
+    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
+        """Execute research task with live tools.
+
+        Args:
+            input_data: Either a dict with ``query`` and optional ``subtopics`` /
+                ``deep_research`` keys, or any other value, which is converted
+                to a string and used as the query
+            context: Additional context passed to ``build_system_prompt``
+
+        Returns:
+            AgentResponse whose output holds ``raw_results``, ``synthesis`` and
+            ``query``; on failure ``success`` is False and ``error`` is set
+        """
+        # Local import: this module does not import asyncio at the top level.
+        import asyncio
+
+        if isinstance(input_data, dict):
+            query = input_data.get("query", "")
+            subtopics = input_data.get("subtopics", [])
+            deep = input_data.get("deep_research", False)
+        else:
+            query = str(input_data)
+            subtopics = []
+            deep = False
+
+        if not query:
+            return AgentResponse(
+                success=False,
+                output=None,
+                error="No research query provided",
+                metadata={"role": "Research Agent"},
+            )
+
+        try:
+            # The research tools are synchronous (blocking HTTP calls), so they
+            # run in a worker thread to keep the event loop responsive.
+            if deep or subtopics:
+                results = await asyncio.to_thread(
+                    self.research.deep_research, query, subtopics
+                )
+            else:
+                results = await asyncio.to_thread(
+                    self.research.comprehensive_search, query
+                )
+
+            research_summary = self._format_research_for_llm(results)
+
+            # Use LLM to synthesize and provide analysis
+            synthesis = await self.call_llm(
+                system_prompt=self.build_system_prompt(context),
+                user_prompt=f"""Based on this research data, provide analysis and insights:
+
+{research_summary}
+
+Task: {query}
+
+Provide:
+1. Key findings synthesized
+2. Most important innovations/discoveries
+3. How this goes beyond typical training data
+4. Recommendations for the manuscript""",
+            )
+
+            return AgentResponse(
+                success=True,
+                output={
+                    "raw_results": results,
+                    "synthesis": synthesis,
+                    "query": query,
+                },
+                metadata={
+                    "role": "Research Agent",
+                    "search_provider": self.research.search.provider,
+                },
+            )
+
+        except Exception as e:
+            # Agent boundary: report the failure in the response, do not raise.
+            return AgentResponse(
+                success=False,
+                output=None,
+                error=f"Research failed: {str(e)}",
+                metadata={"role": "Research Agent"},
+            )
+
+    def _format_research_for_llm(self, results: dict) -> str:
+        """Format research results for LLM consumption.
+
+        Accepts both result shapes produced by the orchestrator: the flat dict
+        from ``comprehensive_search`` and the nested dict from ``deep_research``
+        (``main_research``, ``subtopic_research``, ``cross_references``).
+
+        Args:
+            results: Research results in either shape
+
+        Returns:
+            Markdown text listing the gathered sources
+        """
+        if "main_research" not in results:
+            return "\n".join(self._format_search_results(results))
+
+        # Deep research: render the main search, then every subtopic search.
+        lines = [f"# Deep Research Topic: {results.get('main_topic', '')}", ""]
+        lines.extend(self._format_search_results(results.get("main_research") or {}))
+
+        for subtopic, sub_results in (results.get("subtopic_research") or {}).items():
+            lines.append(f"# Subtopic: {subtopic}")
+            lines.extend(self._format_search_results(sub_results or {}))
+
+        cross_refs = results.get("cross_references") or []
+        if cross_refs:
+            lines.append("## Cross-References")
+            lines.extend(f"- {ref}" for ref in cross_refs)
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def _format_search_results(self, results: dict) -> list[str]:
+        """Render one flat search-result dict as a list of Markdown lines."""
+        output = [
+            f"# Research Query: {results.get('query', '')}",
+            f"Timestamp: {results.get('timestamp', '')}",
+            "",
+        ]
+
+        web = results.get("web", [])
+        if web:
+            output.append("## Web Search Results")
+            for index, hit in enumerate(web[:5], 1):
+                output.append(f"{index}. **{hit.get('title', '')}**")
+                output.append(f"   URL: {hit.get('url', '')}")
+                # "or ''" guards against providers returning an explicit null.
+                output.append(f"   {(hit.get('content') or '')[:200]}...")
+                output.append("")
+
+        wiki = results.get("wikipedia", [])
+        if wiki:
+            output.append("## Wikipedia Results")
+            for article in wiki[:3]:
+                output.append(
+                    f"- {article.get('title', '')}: {(article.get('summary') or '')[:200]}..."
+                )
+            output.append("")
+
+        academic = results.get("academic", [])
+        if academic:
+            output.append("## Academic Papers")
+            for paper in academic[:5]:
+                output.append(f"- {paper.get('title', '')} ({paper.get('year', 'N/A')})")
+                output.append(f"  {paper.get('journal', '')}")
+            output.append("")
+
+        innovations = results.get("innovations", [])
+        if innovations:
+            output.append("## Innovations & New Ideas")
+            for idea in innovations:
+                output.append(f"- {idea}")
+            output.append("")
+
+        return output
+
+
+# Fact-checking with live verification
+class VerifiedFactChecker:
+    """Heuristic fact checker that compares a claim with live search results.
+
+    Each source is classified by word overlap with the claim. This is keyword
+    matching, not semantic verification: a source that repeats the claim in
+    order to refute it still counts as supporting.
+    """
+
+    # A source supports the claim when it contains more than this share of
+    # the claim's distinct words.
+    _SUPPORT_THRESHOLD = 0.7
+    # Whole words that mark a non-supporting source as contradicting.
+    _NEGATION_WORDS = frozenset({"not", "false", "incorrect"})
+    # Below this confidence the claim is flagged as needing a citation.
+    _CITATION_THRESHOLD = 0.8
+
+    def __init__(self, search_provider: str = "tavily"):
+        """Initialize verified fact checker.
+
+        Args:
+            search_provider: Provider name passed to ``SearchTool``
+        """
+        self.search = SearchTool(provider=search_provider)
+        self.wikipedia = WikipediaTool()
+
+    @staticmethod
+    def _tokenize(text) -> set:
+        """Return the set of lowercase words in *text*, ignoring punctuation."""
+        # Local import: this module does not import re at the top level.
+        import re
+
+        return set(re.findall(r"\w+", (text or "").lower()))
+
+    @classmethod
+    def _build_result(cls, claim, supporting, contradicting, neutral) -> dict:
+        """Assemble the verification dict from the classified sources."""
+        total = len(supporting) + len(contradicting) + len(neutral)
+        confidence = len(supporting) / total if total else 0.0
+        return {
+            "claim": claim,
+            "verified": len(supporting) > 0,
+            "confidence": confidence,
+            "supporting_sources": supporting,
+            "contradicting_sources": contradicting,
+            "neutral_sources": neutral,
+            "needs_citation": confidence < cls._CITATION_THRESHOLD,
+        }
+
+    async def verify_claim(
+        self,
+        claim: str,
+        context: str = "",
+    ) -> dict:
+        """Verify a factual claim against live sources.
+
+        Args:
+            claim: The claim to verify
+            context: Additional context (accepted for interface compatibility;
+                not used by the heuristic)
+
+        Returns:
+            Dict with ``claim``, ``verified``, ``confidence``, the supporting /
+            contradicting / neutral source lists, and ``needs_citation``
+        """
+        # Local import: this module does not import asyncio at the top level.
+        import asyncio
+
+        claim_words = self._tokenize(claim)
+        if not claim_words:
+            # Nothing to match against: skip the network round-trips.
+            return self._build_result(claim, [], [], [])
+
+        # Both tools do blocking HTTP calls; run them in worker threads so the
+        # event loop is not stalled, and fetch web and Wikipedia concurrently.
+        web_results, wiki_results = await asyncio.gather(
+            asyncio.to_thread(self.search.search, claim, num_results=5),
+            asyncio.to_thread(self.wikipedia.search, claim, num_results=2),
+        )
+
+        supporting = []
+        contradicting = []
+        neutral = []
+
+        # Wikipedia articles carry a "content" field like web results do, so
+        # they go through the same classification.
+        for source in [*web_results, *wiki_results]:
+            content_words = self._tokenize(source.get("content"))
+            overlap = claim_words & content_words
+
+            if len(overlap) > len(claim_words) * self._SUPPORT_THRESHOLD:
+                supporting.append(source)
+            elif content_words & self._NEGATION_WORDS:
+                # Whole-word match, so "notable" or "another" no longer count.
+                contradicting.append(source)
+            else:
+                neutral.append(source)
+
+        return self._build_result(claim, supporting, contradicting, neutral)
+
+    async def verify_batch(
+        self,
+        claims: list[str],
+    ) -> list[dict]:
+        """Verify multiple claims.
+
+        Claims are verified one after another, in input order.
+
+        Args:
+            claims: List of claims to verify
+
+        Returns:
+            List of verification results, one per claim
+        """
+        return [await self.verify_claim(claim) for claim in claims]
+
+
+def create_research_agent(
+    search_provider: str = "tavily",
+    use_wikipedia: bool = True,
+    use_academic: bool = True,
+    config=None,
+) -> ResearchAgent:
+    """Factory to create a research agent.
+
+    The defaults match ``ResearchAgent``'s own, so calling this with only
+    ``search_provider`` behaves exactly as before.
+
+    Args:
+        search_provider: Search provider (tavily, serper, brave, duckduckgo)
+        use_wikipedia: Include Wikipedia search
+        use_academic: Include academic search
+        config: Agent configuration forwarded to ``ResearchAgent``
+
+    Returns:
+        Configured ResearchAgent
+    """
+    return ResearchAgent(
+        config=config,
+        search_provider=search_provider,
+        use_wikipedia=use_wikipedia,
+        use_academic=use_academic,
+    )
@@ -5,6 +5,13 @@ from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_
 from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
 from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
 from opus_orchestrator.utils.llm import get_llm_client
+from opus_orchestrator.utils.research import (
+    ResearchOrchestrator,
+    SearchTool,
+    WikipediaTool,
+    AcademicSearchTool,
+    create_research_orchestrator,
+)

 __all__ = [
    "generate_docs",
@@ -15,4 +22,10 @@ __all__ = [
    "LocalIngestor",
    "create_local_ingestor",
    "get_llm_client",
+    # Research (NEW!)
+    "ResearchOrchestrator",
+    "SearchTool",
+    "WikipediaTool",
+    "AcademicSearchTool",
+    "create_research_orchestrator",
 ]
@@ -0,0 +1,496 @@
+"""Research tools for Opus Orchestrator.
+
+Provides web search, database lookup, and research capabilities.
+"""
+
+import os
+import json
+from typing import Any, Optional, Callable
+from datetime import datetime
+
+import requests
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class SearchTool:
+    """Web search tool using multiple backends.
+
+    Every backend returns the same normalized shape: a list of dicts with
+    ``title``, ``url``, ``content`` and ``score``. Backend failures are
+    printed and reported as an empty list rather than raised.
+    """
+
+    # Environment variable holding the API key of each keyed provider.
+    # Providers missing here (e.g. duckduckgo) need no key.
+    _API_KEY_ENV = {
+        "tavily": "TAVILY_API_KEY",
+        "serper": "SERPER_API_KEY",
+        "brave": "BRAVE_API_KEY",
+    }
+
+    def __init__(self, provider: str = "tavily"):
+        """Initialize search tool.
+
+        Args:
+            provider: Search provider (tavily, serper, brave, duckduckgo);
+                any other value falls back to DuckDuckGo when searching
+        """
+        self.provider = provider
+        self._setup_provider()
+
+    def _setup_provider(self):
+        """Load the provider's API key from the environment.
+
+        ``self.api_key`` is always defined afterwards; it is None for
+        providers without a key or when the variable is unset.
+        """
+        env_name = self._API_KEY_ENV.get(self.provider)
+        self.api_key = os.environ.get(env_name) if env_name else None
+
+    def search(
+        self,
+        query: str,
+        num_results: int = 10,
+    ) -> list[dict]:
+        """Search the web.
+
+        Args:
+            query: Search query
+            num_results: Number of results to return
+
+        Returns:
+            List of search results with title, url, content, score
+        """
+        backends = {
+            "tavily": self._search_tavily,
+            "serper": self._search_serper,
+            "brave": self._search_brave,
+        }
+        # Unknown providers use DuckDuckGo, which needs no API key.
+        backend = backends.get(self.provider, self._search_duckduckgo)
+        return backend(query, num_results)
+
+    def _search_tavily(self, query: str, num_results: int) -> list[dict]:
+        """Search using Tavily."""
+        try:
+            from tavily import TavilyClient
+            client = TavilyClient(api_key=self.api_key)
+            results = client.search(query=query, max_results=num_results)
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("url", ""),
+                    "content": r.get("content", ""),
+                    "score": r.get("score", 0),
+                }
+                for r in results.get("results", [])
+            ]
+        except Exception as e:
+            print(f"Tavily search error: {e}")
+            return []
+
+    def _search_serper(self, query: str, num_results: int) -> list[dict]:
+        """Search using Serper."""
+        try:
+            headers = {
+                "X-API-KEY": self.api_key,
+                "Content-Type": "application/json",
+            }
+            payload = {"q": query, "num": num_results}
+            response = requests.post(
+                "https://google.serper.dev/search",
+                headers=headers,
+                json=payload,
+                timeout=10,
+            )
+            # Report 4xx/5xx (bad key, quota) instead of parsing an error
+            # payload as if it were an empty result set.
+            response.raise_for_status()
+            data = response.json()
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("link", ""),
+                    "content": r.get("snippet", ""),
+                    "score": 1.0,
+                }
+                for r in data.get("organic", [])
+            ]
+        except Exception as e:
+            print(f"Serper search error: {e}")
+            return []
+
+    def _search_brave(self, query: str, num_results: int) -> list[dict]:
+        """Search using Brave."""
+        try:
+            headers = {"Accept": "application/json", "X-Subscription-Token": self.api_key}
+            response = requests.get(
+                "https://api.search.brave.com/res/v1/web/search",
+                params={"q": query, "count": num_results},
+                headers=headers,
+                timeout=10,
+            )
+            # See _search_serper: HTTP errors must not look like "no results".
+            response.raise_for_status()
+            data = response.json()
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("url", ""),
+                    "content": r.get("description", ""),
+                    "score": r.get("score", 0),
+                }
+                for r in data.get("web", {}).get("results", [])
+            ]
+        except Exception as e:
+            print(f"Brave search error: {e}")
+            return []
+
+    def _search_duckduckgo(self, query: str, num_results: int) -> list[dict]:
+        """Search using DuckDuckGo (no API key needed)."""
+        try:
+            from duckduckgo_search import DDGS
+            results = DDGS().text(query, max_results=num_results)
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("href", ""),
+                    "content": r.get("body", ""),
+                    "score": 1.0,
+                }
+                for r in results
+            ]
+        except Exception as e:
+            print(f"DuckDuckGo search error: {e}")
+            return []
+
+
+class WikipediaTool:
+    """Wikipedia lookup tool backed by the ``wikipedia`` package.
+
+    The package is imported lazily inside each method, so a missing
+    installation surfaces as an empty result or an ``error`` dict.
+    """
+
+    def __init__(self):
+        """Initialize Wikipedia tool (it holds no state)."""
+        pass
+
+    def search(self, query: str, num_results: int = 5) -> list[dict]:
+        """Search Wikipedia.
+
+        Args:
+            query: Search query
+            num_results: Number of results
+
+        Returns:
+            List of articles with ``title``, ``url``, ``summary`` (first 500
+            characters) and ``content`` (first 2000 characters); titles whose
+            page cannot be loaded are skipped
+        """
+        try:
+            import wikipedia
+            titles = wikipedia.search(query, results=num_results)
+        except Exception as e:
+            print(f"Wikipedia search error: {e}")
+            return []
+
+        articles = []
+        for title in titles:
+            try:
+                # Titles come straight from the search API, so auto-suggest is
+                # disabled: it can silently swap an exact title for another page.
+                page = wikipedia.page(title, auto_suggest=False)
+                articles.append({
+                    "title": page.title,
+                    "url": page.url,
+                    "summary": page.summary[:500],
+                    "content": page.content[:2000],
+                })
+            except Exception:
+                # Best effort: skip disambiguation or missing pages, but let
+                # KeyboardInterrupt and SystemExit propagate.
+                continue
+        return articles
+
+    def get_article(self, title: str) -> dict:
+        """Get a Wikipedia article by title.
+
+        Args:
+            title: Article title
+
+        Returns:
+            Dict with ``title``, ``url``, ``summary``, ``content`` (first 5000
+            characters) and up to 10 ``references``; on failure a dict with a
+            single ``error`` key
+        """
+        try:
+            import wikipedia
+            page = wikipedia.page(title)
+            return {
+                "title": page.title,
+                "url": page.url,
+                "summary": page.summary,
+                "content": page.content[:5000],
+                "references": page.references[:10] if hasattr(page, "references") else [],
+            }
+        except Exception as e:
+            return {"error": str(e)}
+
+
+class ArxivTool:
+    """ArXiv paper search tool backed by the ``arxiv`` package."""
+
+    def __init__(self):
+        """Initialize ArXiv tool (it holds no state)."""
+        pass
+
+    @staticmethod
+    def _build_query(query: str, categories: Optional[list[str]] = None) -> str:
+        """Combine the free-text query with an optional category filter.
+
+        ``arxiv.Search`` has no ``categories`` argument; categories are
+        expressed in the query string with the ``cat:`` field prefix.
+        """
+        if not categories:
+            return query
+        category_filter = " OR ".join(f"cat:{category}" for category in categories)
+        if not query:
+            return category_filter
+        return f"({query}) AND ({category_filter})"
+
+    def search(
+        self,
+        query: str,
+        max_results: int = 10,
+        categories: Optional[list[str]] = None,
+    ) -> list[dict]:
+        """Search ArXiv for papers.
+
+        Args:
+            query: Search query
+            max_results: Max results
+            categories: ArXiv categories to filter by (e.g. ``["cs.AI"]``); a
+                paper matches when it belongs to any of them
+
+        Returns:
+            List of papers with ``title``, ``url``, ``abstract`` (first 1000
+            characters), ``authors``, ``published`` and ``categories``; an
+            empty list if the search fails
+        """
+        try:
+            import arxiv
+            client = arxiv.Client()
+            search = arxiv.Search(
+                query=self._build_query(query, categories),
+                max_results=max_results,
+            )
+            return [
+                {
+                    "title": result.title,
+                    "url": result.entry_id,
+                    "abstract": result.summary[:1000],
+                    "authors": [a.name for a in result.authors],
+                    "published": str(result.published.date()),
+                    "categories": result.categories,
+                }
+                for result in client.results(search)
+            ]
+        except Exception as e:
+            print(f"ArXiv search error: {e}")
+            return []
+
+
+class AcademicSearchTool:
+    """Academic paper search (CrossRef, Semantic Scholar).
+
+    Both searches print failures and return an empty list instead of raising.
+    """
+
+    def __init__(self):
+        """Initialize academic search tool (it holds no state)."""
+        pass
+
+    @staticmethod
+    def _first(values) -> str:
+        """Return the first entry of a list field, or "" when missing/empty."""
+        return values[0] if values else ""
+
+    @staticmethod
+    def _crossref_year(item: dict):
+        """Return the publication year of a CrossRef record, or None.
+
+        ``issued`` is preferred; ``created`` (when the record was registered)
+        is only a fallback because it can be years later than publication.
+        """
+        for field in ("issued", "created"):
+            date_parts = (item.get(field) or {}).get("date-parts") or []
+            if date_parts and date_parts[0] and date_parts[0][0] is not None:
+                return date_parts[0][0]
+        return None
+
+    @staticmethod
+    def _crossref_author(author: dict) -> str:
+        """Return a display name for one CrossRef author entry."""
+        full_name = f"{author.get('given', '')} {author.get('family', '')}".strip()
+        # Entries without given/family names may carry a single "name" field.
+        return full_name or author.get("name", "")
+
+    @classmethod
+    def _parse_crossref_item(cls, item: dict) -> dict:
+        """Normalize one CrossRef work record."""
+        return {
+            "title": cls._first(item.get("title")),
+            "url": item.get("URL", ""),
+            "authors": [cls._crossref_author(a) for a in item.get("author") or []],
+            "year": cls._crossref_year(item),
+            "journal": cls._first(item.get("container-title")),
+            "doi": item.get("DOI", ""),
+        }
+
+    @staticmethod
+    def _parse_semantic_scholar_paper(paper: dict) -> dict:
+        """Normalize one Semantic Scholar paper record.
+
+        Fields can be present with a null value (commonly ``abstract``), so
+        each one is coalesced rather than relying on ``dict.get`` defaults.
+        """
+        return {
+            "title": paper.get("title") or "",
+            "url": paper.get("url") or "",
+            "abstract": (paper.get("abstract") or "")[:500],
+            "authors": [a.get("name", "") for a in (paper.get("authors") or [])[:5]],
+            "year": paper.get("year"),
+            "citations": paper.get("citationCount") or 0,
+        }
+
+    def search_crossref(self, query: str, max_results: int = 10) -> list[dict]:
+        """Search CrossRef for academic papers.
+
+        Args:
+            query: Search query
+            max_results: Maximum number of records to request
+
+        Returns:
+            List of dicts with ``title``, ``url``, ``authors``, ``year``,
+            ``journal`` and ``doi``; an empty list if the request fails
+        """
+        try:
+            url = "https://api.crossref.org/works"
+            params = {"query": query, "rows": max_results}
+            response = requests.get(url, params=params, timeout=10)
+            # Surface HTTP errors instead of parsing an error body.
+            response.raise_for_status()
+            data = response.json()
+            return [
+                self._parse_crossref_item(item)
+                for item in data.get("message", {}).get("items", [])
+            ]
+        except Exception as e:
+            print(f"CrossRef search error: {e}")
+            return []
+
+    def search_semantic_scholar(self, query: str, max_results: int = 10) -> list[dict]:
+        """Search Semantic Scholar for papers.
+
+        Args:
+            query: Search query
+            max_results: Maximum number of papers to request
+
+        Returns:
+            List of dicts with ``title``, ``url``, ``abstract`` (first 500
+            characters), ``authors`` (first five), ``year`` and ``citations``;
+            an empty list if the request fails
+        """
+        try:
+            url = "https://api.semanticscholar.org/graph/v1/paper/search"
+            params = {
+                "query": query,
+                "limit": max_results,
+                "fields": "title,url,abstract,authors,year,citationCount",
+            }
+            response = requests.get(url, params=params, timeout=10)
+            # Surface HTTP errors (e.g. rate limiting) instead of parsing them.
+            response.raise_for_status()
+            data = response.json()
+            return [
+                self._parse_semantic_scholar_paper(paper)
+                for paper in data.get("data") or []
+            ]
+        except Exception as e:
+            print(f"Semantic Scholar search error: {e}")
+            return []
+
+
+class ResearchOrchestrator:
+    """Coordinates web, Wikipedia and academic lookups for a research query."""
+
+    def __init__(
+        self,
+        search_provider: str = "tavily",
+        use_wikipedia: bool = True,
+        use_academic: bool = True,
+    ):
+        """Create the tools this orchestrator will query.
+
+        Args:
+            search_provider: Provider name handed to ``SearchTool``
+            use_wikipedia: Create a ``WikipediaTool`` (otherwise None)
+            use_academic: Create an ``AcademicSearchTool`` (otherwise None)
+        """
+        self.search = SearchTool(provider=search_provider)
+
+        self.wikipedia = None
+        if use_wikipedia:
+            self.wikipedia = WikipediaTool()
+
+        self.academic = None
+        if use_academic:
+            self.academic = AcademicSearchTool()
+
+    def comprehensive_search(
+        self,
+        query: str,
+        include_web: bool = True,
+        include_wikipedia: bool = True,
+        include_academic: bool = True,
+    ) -> dict:
+        """Query every enabled source once and merge the answers.
+
+        Args:
+            query: Research query
+            include_web: Run the web search
+            include_wikipedia: Query Wikipedia, if that tool was created
+            include_academic: Query CrossRef and Semantic Scholar, if that
+                tool was created
+
+        Returns:
+            Dict with ``query``, ``timestamp``, ``web``, ``wikipedia``,
+            ``academic`` and ``innovations``
+        """
+        # The timestamp is taken before any source is contacted.
+        started_at = datetime.now().isoformat()
+
+        web_hits = []
+        if include_web:
+            web_hits = self.search.search(query, num_results=10)
+
+        wiki_hits = []
+        if self.wikipedia and include_wikipedia:
+            wiki_hits = self.wikipedia.search(query, num_results=5)
+
+        papers = []
+        if self.academic and include_academic:
+            # CrossRef entries first, Semantic Scholar entries appended after.
+            papers = self.academic.search_crossref(query, max_results=5)
+            papers.extend(self.academic.search_semantic_scholar(query, max_results=5))
+
+        results = {
+            "query": query,
+            "timestamp": started_at,
+            "web": web_hits,
+            "wikipedia": wiki_hits,
+            "academic": papers,
+            "innovations": [],
+        }
+        results["innovations"] = self._generate_innovations(results)
+        return results
+
+    def _generate_innovations(self, research: dict) -> list[str]:
+        """Derive a few fixed-template notes from the gathered results.
+
+        Args:
+            research: Combined research results
+
+        Returns:
+            Up to three notes: a cross-disciplinary hint when web text and
+            academic titles both exist, a research-gap note when fewer than
+            three academic entries were found, and always a timestamp note
+        """
+        web_hits = research.get("web", [])
+        papers = research.get("academic", [])
+
+        snippets = " ".join(hit.get("content", "")[:200] for hit in web_hits[:5])
+        titles = [paper.get("title", "") for paper in papers[:5]]
+
+        ideas = []
+        if snippets and titles:
+            ideas.append(
+                "Cross-disciplinary connection: Apply web trends to academic findings"
+            )
+        if len(papers) < 3:
+            ideas.append(
+                "Research gap: Limited academic coverage - original contribution opportunity"
+            )
+        ideas.append(
+            f"Research timestamp: {research.get('timestamp')} - ensures current information"
+        )
+        return ideas
+
+    def deep_research(
+        self,
+        topic: str,
+        subtopics: list[str] = None,
+    ) -> dict:
+        """Research a topic, then each subtopic combined with that topic.
+
+        Args:
+            topic: Main topic
+            subtopics: Subtopics, each searched as ``"<topic>: <subtopic>"``
+
+        Returns:
+            Dict with ``main_topic``, ``main_research``, ``subtopic_research``
+            (keyed by subtopic) and ``cross_references``
+        """
+        main_research = self.comprehensive_search(topic)
+        per_subtopic = {}
+        results = {
+            "main_topic": topic,
+            "main_research": main_research,
+            "subtopic_research": per_subtopic,
+        }
+
+        for subtopic in subtopics or []:
+            per_subtopic[subtopic] = self.comprehensive_search(f"{topic}: {subtopic}")
+
+        results["cross_references"] = self._cross_reference(results)
+        return results
+
+    def _cross_reference(self, deep_results: dict) -> list[str]:
+        """Report subtopics whose web snippets share vocabulary with the main topic.
+
+        Only the first 300 characters of the first three web results on each
+        side are compared; a subtopic is reported when more than ten distinct
+        lowercase words occur on both sides.
+        """
+
+        def web_text(block: dict) -> str:
+            # Join the leading snippet of the top three web results.
+            return " ".join(
+                hit.get("content", "")[:300] for hit in block.get("web", [])[:3]
+            )
+
+        main_text = web_text(deep_results.get("main_research", {}))
+        main_vocab = set(main_text.lower().split())
+
+        refs = []
+        for subtopic, sub_data in deep_results.get("subtopic_research", {}).items():
+            sub_text = web_text(sub_data)
+            if not (main_text and sub_text):
+                continue
+            shared = main_vocab & set(sub_text.lower().split())
+            if len(shared) > 10:
+                refs.append(f"Connection found: {subtopic} relates to main topic via {len(shared)} shared concepts")
+        return refs
+
+
+def create_research_orchestrator(
+    search_provider: str = "tavily",
+    use_wikipedia: bool = True,
+    use_academic: bool = True,
+) -> ResearchOrchestrator:
+    """Build a ``ResearchOrchestrator`` from the given options.
+
+    Args:
+        search_provider: Provider name for the web search tool
+        use_wikipedia: Enable the Wikipedia tool
+        use_academic: Enable the academic search tool
+
+    Returns:
+        A new ResearchOrchestrator configured with these options
+    """
+    options = {
+        "search_provider": search_provider,
+        "use_wikipedia": use_wikipedia,
+        "use_academic": use_academic,
+    }
+    return ResearchOrchestrator(**options)
@@ -30,6 +30,11 @@ dependencies = [
    "tiktoken>=0.7.0",
    "markdown>=3.7",
    "python-dotenv>=1.0.0",
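+    # Research dependencies.
+    # NOTE(review): the code does `from tavily import TavilyClient`; that module
+    # appears to be published as the `tavily-python` distribution - confirm that
+    # `tavily` is the intended package name.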
+    "tavily>=0.3.0",
+    "wikipedia>=1.4.0",
+    "arxiv>=1.4.0",
+    "duckduckgo-search>=7.0.0",
 ]

 [project.optional-dependencies]