diff --git a/opus_orchestrator/__init__.py b/opus_orchestrator/__init__.py
index fdca3a8..8dcde2c 100644
--- a/opus_orchestrator/__init__.py
+++ b/opus_orchestrator/__init__.py
@@ -22,6 +22,11 @@ from opus_orchestrator.agents.nonfiction import (
     NonfictionWriterAgent,
     ResearcherAgent,
 )
+from opus_orchestrator.agents.research import (
+    ResearchAgent,
+    VerifiedFactChecker,
+    create_research_agent,
+)
 from opus_orchestrator.config import OpusConfig, get_config
 from opus_orchestrator.schemas import (
     BookIntent,
@@ -83,6 +88,10 @@ __all__ = [
     "NonfictionWriterAgent",
     "FactCheckerAgent",
     "NonfictionEditorAgent",
+    # Research Agent (NEW!)
+    "ResearchAgent",
+    "VerifiedFactChecker",
+    "create_research_agent",
     # LangGraph
     "OpusGraph",
     "OpusGraphState",
diff --git a/opus_orchestrator/agents/research.py b/opus_orchestrator/agents/research.py
new file mode 100644
index 0000000..9315b4a
--- /dev/null
+++ b/opus_orchestrator/agents/research.py
@@ -0,0 +1,381 @@
+"""Research Agent for Opus Orchestrator.
+
+Enhanced nonfiction agent with live research capabilities.
+"""
+
+import asyncio
+import os
+import re
+from typing import Any, Optional
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+from opus_orchestrator.agents.base import BaseAgent, AgentResponse
+from opus_orchestrator.utils.research import (
+    ResearchOrchestrator,
+    create_research_orchestrator,
+    SearchTool,
+    WikipediaTool,
+    AcademicSearchTool,
+)
+
+
+# System prompt for research agent
+RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access
+
+You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.
+
+## Your Capabilities
+
+1. **Web Search** - Search the current web for latest information
+2. **Wikipedia** - Access encyclopedic knowledge
+3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
+4. **Innovation Detection** - Identify gaps and new ideas beyond training data
+
+## Your Mission
+
+NOT just verify facts — **DISCOVER new information, trends, and innovations**.
+
+- Find what's NEW since your training cutoff
+- Identify research gaps and opportunities
+- Connect disparate ideas into novel insights
+- Go beyond what you "know" to what you can FIND
+
+## Research Process
+
+1. **Explore** - Broad search on topic
+2. **Deep Dive** - Specific searches on subtopics
+3. **Cross-Reference** - Find connections between sources
+4. **Innovate** - Generate original insights beyond training data
+
+## Output Format
+
+Provide your research in this structure:
+
+```
+## Findings (What you discovered)
+- [New information 1]
+- [New information 2]
+- [Latest developments]
+
+## Sources (Where you found it)
+- [URL 1]: [Title]
+- [URL 2]: [Title]
+
+## Innovations (Original insights beyond training data)
+- [Novel connection 1]
+- [Novel connection 2]
+
+## Research Gaps (What's not well-covered)
+- [Gap 1]
+- [Gap 2]
+```
+
+## Remember
+
+You're not just fact-checking — you're RESEARCHING. Actively seek new information,
+challenge assumptions, and generate original ideas. This keeps the content fresh
+and prevents "AI slop" from repetitive training data patterns.
+"""
+
+
+class ResearchAgent(BaseAgent):
+    """Enhanced research agent with live web access and innovation detection."""
+
+    def __init__(
+        self,
+        config=None,
+        search_provider: str = "tavily",
+        use_wikipedia: bool = True,
+        use_academic: bool = True,
+    ):
+        """Initialize research agent with tools.
+
+        Args:
+            config: Agent configuration
+            search_provider: Search provider (tavily, serper, brave, duckduckgo)
+            use_wikipedia: Include Wikipedia search
+            use_academic: Include academic search
+        """
+        # Initialize research tools
+        self.research = create_research_orchestrator(
+            search_provider=search_provider,
+            use_wikipedia=use_wikipedia,
+            use_academic=use_academic,
+        )
+
+        # Expose the orchestrator's own tool instances instead of building duplicates
+        self.search_tool = self.research.search
+        self.wikipedia = self.research.wikipedia
+        self.academic = self.research.academic
+
+        super().__init__(
+            role="Research Agent",
+            description="Live web research with innovation detection",
+            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
+            config=config,
+        )
+
+    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
+        """Execute research task with live tools.
+
+        Args:
+            input_data: Research query string, or a dict with "query" and
+                optional "subtopics" / "deep_research" keys
+            context: Additional context
+
+        Returns:
+            AgentResponse whose output holds the raw results, the LLM
+            synthesis and the query; success is False with an error
+            message when the query is empty or any step raises
+        """
+        # Extract query
+        if isinstance(input_data, dict):
+            query = input_data.get("query", "")
+            subtopics = input_data.get("subtopics", [])
+            deep = input_data.get("deep_research", False)
+        else:
+            query = str(input_data)
+            subtopics = []
+            deep = False
+
+        if not query:
+            return AgentResponse(
+                success=False,
+                output=None,
+                error="No research query provided",
+                metadata={"role": "Research Agent"},
+            )
+
+        try:
+            # Perform research. The tools do blocking network I/O, so run
+            # them in a worker thread to keep the event loop responsive.
+            if deep or subtopics:
+                # Deep research with subtopics
+                results = await asyncio.to_thread(
+                    self.research.deep_research, query, subtopics
+                )
+            else:
+                # Quick comprehensive search
+                results = await asyncio.to_thread(
+                    self.research.comprehensive_search, query
+                )
+
+            # Format results for LLM
+            research_summary = self._format_research_for_llm(results)
+
+            # Use LLM to synthesize and provide analysis
+            synthesis = await self.call_llm(
+                system_prompt=self.build_system_prompt(context),
+                user_prompt=f"""Based on this research data, provide analysis and insights:
+
+{research_summary}
+
+Task: {query}
+
+Provide:
+1. Key findings synthesized
+2. Most important innovations/discoveries
+3. How this goes beyond typical training data
+4. Recommendations for the manuscript""",
+            )
+
+            return AgentResponse(
+                success=True,
+                output={
+                    "raw_results": results,
+                    "synthesis": synthesis,
+                    "query": query,
+                },
+                metadata={
+                    "role": "Research Agent",
+                    "search_provider": self.research.search.provider,
+                },
+            )
+
+        except Exception as e:
+            return AgentResponse(
+                success=False,
+                output=None,
+                error=f"Research failed: {str(e)}",
+                metadata={"role": "Research Agent"},
+            )
+
+    def _format_research_for_llm(self, results: dict) -> str:
+        """Format research results for LLM consumption.
+
+        Accepts both the flat dict from ``comprehensive_search`` and the
+        nested dict from ``deep_research``, whose searches live under the
+        "main_research" and "subtopic_research" keys.
+        """
+        if "main_research" not in results:
+            return self._format_search_results(results)
+
+        sections = [self._format_search_results(results.get("main_research") or {})]
+        for subtopic, data in (results.get("subtopic_research") or {}).items():
+            sections.append(f"# Subtopic: {subtopic}")
+            sections.append(self._format_search_results(data or {}))
+
+        cross_refs = results.get("cross_references") or []
+        if cross_refs:
+            sections.append("## Cross-References")
+            sections.extend(f"- {ref}" for ref in cross_refs)
+
+        return "\n".join(sections)
+
+    def _format_search_results(self, results: dict) -> str:
+        """Format one ``comprehensive_search`` result dict as markdown text."""
+        output = []
+
+        # Query
+        output.append(f"# Research Query: {results.get('query', '')}")
+        output.append(f"Timestamp: {results.get('timestamp', '')}")
+        output.append("")
+
+        # Web results
+        web = results.get("web", [])
+        if web:
+            output.append("## Web Search Results")
+            for i, r in enumerate(web[:5], 1):
+                output.append(f"{i}. **{r.get('title', '')}**")
+                output.append(f"   URL: {r.get('url', '')}")
+                output.append(f"   {(r.get('content') or '')[:200]}...")
+            output.append("")
+
+        # Wikipedia
+        wiki = results.get("wikipedia", [])
+        if wiki:
+            output.append("## Wikipedia Results")
+            for r in wiki[:3]:
+                output.append(f"- {r.get('title', '')}: {(r.get('summary') or '')[:200]}...")
+            output.append("")
+
+        # Academic
+        academic = results.get("academic", [])
+        if academic:
+            output.append("## Academic Papers")
+            for r in academic[:5]:
+                output.append(f"- {r.get('title', '')} ({r.get('year', 'N/A')})")
+                output.append(f"  {r.get('journal', '')}")
+            output.append("")
+
+        # Innovations
+        innovations = results.get("innovations", [])
+        if innovations:
+            output.append("## Innovations & New Ideas")
+            for idea in innovations:
+                output.append(f"- {idea}")
+            output.append("")
+
+        return "\n".join(output)
+
+
+# Fact-checking with live verification
+class VerifiedFactChecker:
+    """Fact checker with live source verification."""
+
+    # Whole words that mark a source as possibly contradicting the claim
+    _NEGATION_WORDS = frozenset({"not", "false", "incorrect"})
+
+    def __init__(self, search_provider: str = "tavily"):
+        """Initialize verified fact checker."""
+        self.search = SearchTool(provider=search_provider)
+        self.wikipedia = WikipediaTool()
+
+    async def verify_claim(
+        self,
+        claim: str,
+        context: str = "",
+    ) -> dict:
+        """Verify a factual claim against live sources.
+
+        Uses a keyword-overlap heuristic: a source supports the claim when it
+        contains more than 70% of the claim's distinct words.
+
+        Args:
+            claim: The claim to verify
+            context: Additional context (currently unused)
+
+        Returns:
+            Verification result with confidence and sources
+        """
+        # Both lookups do blocking network I/O, so keep them off the event loop
+        results = await asyncio.to_thread(self.search.search, claim, num_results=5)
+
+        # Check Wikipedia
+        wiki_results = await asyncio.to_thread(
+            self.wikipedia.search, claim, num_results=2
+        )
+
+        # Analyze
+        supporting = []
+        contradicting = []
+        neutral = []
+
+        # Tokenize on word characters so punctuation ("Paris." vs "paris") does
+        # not defeat the match; the claim is tokenized once, not per source
+        claim_words = set(re.findall(r"\w+", claim.lower()))
+
+        for r in results + wiki_results:
+            content_words = set(re.findall(r"\w+", (r.get("content") or "").lower()))
+
+            # Simple keyword matching
+            overlap = claim_words & content_words
+
+            if len(overlap) > len(claim_words) * 0.7:
+                supporting.append(r)
+            elif content_words & self._NEGATION_WORDS:
+                contradicting.append(r)
+            else:
+                neutral.append(r)
+
+        # Calculate confidence
+        total = len(supporting) + len(contradicting) + len(neutral)
+        if total == 0:
+            confidence = 0.0
+        else:
+            confidence = len(supporting) / total
+
+        return {
+            "claim": claim,
+            "verified": len(supporting) > 0,
+            "confidence": confidence,
+            "supporting_sources": supporting,
+            "contradicting_sources": contradicting,
+            "neutral_sources": neutral,
+            "needs_citation": confidence < 0.8,
+        }
+
+    async def verify_batch(
+        self,
+        claims: list[str],
+    ) -> list[dict]:
+        """Verify multiple claims.
+
+        Args:
+            claims: List of claims to verify
+
+        Returns:
+            List of verification results
+        """
+        results = []
+        for claim in claims:
+            result = await self.verify_claim(claim)
+            results.append(result)
+        return results
+
+
+def create_research_agent(
+    search_provider: str = "tavily",
+) -> ResearchAgent:
+    """Factory to create a research agent.
+
+    Args:
+        search_provider: Search provider
+
+    Returns:
+        Configured ResearchAgent
+    """
+    return ResearchAgent(search_provider=search_provider)
diff --git a/opus_orchestrator/utils/__init__.py b/opus_orchestrator/utils/__init__.py
index ee9c5ac..5026988 100644
--- a/opus_orchestrator/utils/__init__.py
+++ b/opus_orchestrator/utils/__init__.py
@@ -5,6 +5,13 @@ from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_
 from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
 from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
 from opus_orchestrator.utils.llm import get_llm_client
+from opus_orchestrator.utils.research import (
+    ResearchOrchestrator,
+    SearchTool,
+    WikipediaTool,
+    AcademicSearchTool,
+    create_research_orchestrator,
+)
 
 __all__ = [
     "generate_docs",
@@ -15,4 +22,10 @@ __all__ = [
     "LocalIngestor",
     "create_local_ingestor",
     "get_llm_client",
+    # Research (NEW!)
+    "ResearchOrchestrator",
+    "SearchTool",
+    "WikipediaTool",
+    "AcademicSearchTool",
+    "create_research_orchestrator",
 ]
diff --git a/opus_orchestrator/utils/research.py b/opus_orchestrator/utils/research.py
new file mode 100644
index 0000000..aa7715f
--- /dev/null
+++ b/opus_orchestrator/utils/research.py
@@ -0,0 +1,538 @@
+"""Research tools for Opus Orchestrator.
+
+Provides web search, database lookup, and research capabilities.
+"""
+
+import os
+import json
+import logging
+from typing import Any, Optional, Callable
+from datetime import datetime
+
+import requests
+from dotenv import load_dotenv
+
+load_dotenv()
+
+logger = logging.getLogger(__name__)
+
+
+class SearchTool:
+    """Web search tool using multiple backends."""
+
+    # Environment variable holding the API key for each keyed provider
+    _API_KEY_ENV = {
+        "tavily": "TAVILY_API_KEY",
+        "serper": "SERPER_API_KEY",
+        "brave": "BRAVE_API_KEY",
+    }
+
+    def __init__(self, provider: str = "tavily"):
+        """Initialize search tool.
+
+        Args:
+            provider: Search provider (tavily, serper, brave, duckduckgo)
+        """
+        self.provider = provider
+        self._setup_provider()
+
+    def _setup_provider(self):
+        """Set up the search provider.
+
+        Always defines ``self.api_key`` (None for keyless providers such as
+        DuckDuckGo) so the attribute can be read safely for any provider.
+        """
+        env_var = self._API_KEY_ENV.get(self.provider)
+        self.api_key = os.environ.get(env_var) if env_var else None
+        if env_var and not self.api_key:
+            logger.warning(
+                "%s is not set; %s searches are likely to fail", env_var, self.provider
+            )
+
+    def search(
+        self,
+        query: str,
+        num_results: int = 10,
+    ) -> list[dict]:
+        """Search the web.
+
+        Any provider other than tavily, serper or brave falls back to DuckDuckGo.
+
+        Args:
+            query: Search query
+            num_results: Number of results to return
+
+        Returns:
+            List of result dicts with title, url, content and score keys
+        """
+        if self.provider == "tavily":
+            return self._search_tavily(query, num_results)
+        elif self.provider == "serper":
+            return self._search_serper(query, num_results)
+        elif self.provider == "brave":
+            return self._search_brave(query, num_results)
+        else:
+            return self._search_duckduckgo(query, num_results)
+
+    def _search_tavily(self, query: str, num_results: int) -> list[dict]:
+        """Search using Tavily."""
+        try:
+            from tavily import TavilyClient
+            client = TavilyClient(api_key=self.api_key)
+            results = client.search(query=query, max_results=num_results)
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("url", ""),
+                    "content": r.get("content", ""),
+                    "score": r.get("score", 0),
+                }
+                for r in results.get("results", [])
+            ]
+        except Exception as e:
+            logger.error("Tavily search error: %s", e)
+            return []
+
+    def _search_serper(self, query: str, num_results: int) -> list[dict]:
+        """Search using Serper."""
+        try:
+            headers = {
+                "X-API-KEY": self.api_key,
+                "Content-Type": "application/json",
+            }
+            payload = {"q": query, "num": num_results}
+            response = requests.post(
+                "https://google.serper.dev/search",
+                headers=headers,
+                json=payload,
+                timeout=10,
+            )
+            response.raise_for_status()  # don't parse HTTP error bodies as results
+            data = response.json()
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("link", ""),
+                    "content": r.get("snippet", ""),
+                    "score": 1.0,
+                }
+                for r in data.get("organic", [])
+            ]
+        except Exception as e:
+            logger.error("Serper search error: %s", e)
+            return []
+
+    def _search_brave(self, query: str, num_results: int) -> list[dict]:
+        """Search using Brave."""
+        try:
+            headers = {"Accept": "application/json", "X-Subscription-Token": self.api_key}
+            response = requests.get(
+                "https://api.search.brave.com/res/v1/web/search",
+                params={"q": query, "count": num_results},
+                headers=headers,
+                timeout=10,
+            )
+            response.raise_for_status()  # don't parse HTTP error bodies as results
+            data = response.json()
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("url", ""),
+                    "content": r.get("description", ""),
+                    "score": r.get("score", 0),
+                }
+                for r in data.get("web", {}).get("results", [])
+            ]
+        except Exception as e:
+            logger.error("Brave search error: %s", e)
+            return []
+
+    def _search_duckduckgo(self, query: str, num_results: int) -> list[dict]:
+        """Search using DuckDuckGo (no API key needed)."""
+        try:
+            from duckduckgo_search import DDGS
+            results = DDGS().text(query, max_results=num_results)
+            return [
+                {
+                    "title": r.get("title", ""),
+                    "url": r.get("href", ""),
+                    "content": r.get("body", ""),
+                    "score": 1.0,
+                }
+                for r in results
+            ]
+        except Exception as e:
+            logger.error("DuckDuckGo search error: %s", e)
+            return []
+
+
+class WikipediaTool:
+    """Wikipedia lookup tool."""
+
+    def __init__(self):
+        """Initialize Wikipedia tool."""
+        pass
+
+    def search(self, query: str, num_results: int = 5) -> list[dict]:
+        """Search Wikipedia.
+
+        Args:
+            query: Search query
+            num_results: Number of results
+
+        Returns:
+            List of Wikipedia articles
+        """
+        try:
+            import wikipedia
+            results = wikipedia.search(query, results=num_results)
+            articles = []
+            for title in results:
+                try:
+                    # Titles come straight from the search API, so disable
+                    # auto-suggest, which can resolve to a different page
+                    page = wikipedia.page(title, auto_suggest=False)
+                    articles.append({
+                        "title": page.title,
+                        "url": page.url,
+                        "summary": page.summary[:500],
+                        "content": page.content[:2000],
+                    })
+                except Exception as e:
+                    # Disambiguation and missing pages are expected; skip them
+                    logger.debug("Skipping Wikipedia page %r: %s", title, e)
+                    continue
+            return articles
+        except Exception as e:
+            logger.error("Wikipedia search error: %s", e)
+            return []
+
+    def get_article(self, title: str) -> dict:
+        """Get a Wikipedia article by title.
+
+        Args:
+            title: Article title
+
+        Returns:
+            Article content
+        """
+        try:
+            import wikipedia
+            page = wikipedia.page(title)
+            return {
+                "title": page.title,
+                "url": page.url,
+                "summary": page.summary,
+                "content": page.content[:5000],
+                "references": page.references[:10] if hasattr(page, "references") else [],
+            }
+        except Exception as e:
+            return {"error": str(e)}
+
+
+class ArxivTool:
+    """ArXiv paper search tool."""
+
+    def __init__(self):
+        """Initialize ArXiv tool."""
+        pass
+
+    def search(
+        self,
+        query: str,
+        max_results: int = 10,
+        categories: Optional[list[str]] = None,
+    ) -> list[dict]:
+        """Search ArXiv for papers.
+
+        Args:
+            query: Search query
+            max_results: Max results
+            categories: ArXiv categories to filter (e.g. "cs.AI"); papers
+                matching any of them are returned
+
+        Returns:
+            List of papers
+        """
+        try:
+            import arxiv
+            # arxiv.Search accepts no category argument, so express the filter
+            # in the query itself via arXiv's "cat:" field prefix
+            if categories:
+                category_filter = " OR ".join(f"cat:{c}" for c in categories)
+                query = f"({query}) AND ({category_filter})"
+            client = arxiv.Client()
+            search = arxiv.Search(
+                query=query,
+                max_results=max_results,
+            )
+            papers = []
+            for result in client.results(search):
+                papers.append({
+                    "title": result.title,
+                    "url": result.entry_id,
+                    "abstract": result.summary[:1000],
+                    "authors": [a.name for a in result.authors],
+                    "published": str(result.published.date()),
+                    "categories": result.categories,
+                })
+            return papers
+        except Exception as e:
+            logger.error("ArXiv search error: %s", e)
+            return []
+
+
+class AcademicSearchTool:
+    """Academic paper search (CrossRef, Semantic Scholar)."""
+
+    def __init__(self):
+        """Initialize academic search tool."""
+        pass
+
+    @staticmethod
+    def _crossref_year(item: dict) -> Optional[int]:
+        """Return the publication year of a CrossRef work, if known."""
+        # "issued" is the publication date; "created" is only when the DOI
+        # record was registered, so it is used purely as a fallback
+        for field in ("issued", "created"):
+            parts = (item.get(field) or {}).get("date-parts") or [[]]
+            if parts[0] and parts[0][0] is not None:
+                return parts[0][0]
+        return None
+
+    def search_crossref(self, query: str, max_results: int = 10) -> list[dict]:
+        """Search CrossRef for academic papers."""
+        try:
+            url = "https://api.crossref.org/works"
+            params = {"query": query, "rows": max_results}
+            response = requests.get(url, params=params, timeout=10)
+            response.raise_for_status()  # don't parse HTTP error bodies as results
+            data = response.json()
+            return [
+                {
+                    # CrossRef may return empty lists for title fields
+                    "title": (item.get("title") or [""])[0],
+                    "url": item.get("URL", ""),
+                    "authors": [f"{a.get('given', '')} {a.get('family', '')}".strip()
+                                for a in item.get("author", [])],
+                    "year": self._crossref_year(item),
+                    "journal": (item.get("container-title") or [""])[0],
+                    "doi": item.get("DOI", ""),
+                }
+                for item in data.get("message", {}).get("items", [])
+            ]
+        except Exception as e:
+            logger.error("CrossRef search error: %s", e)
+            return []
+
+    def search_semantic_scholar(self, query: str, max_results: int = 10) -> list[dict]:
+        """Search Semantic Scholar for papers."""
+        try:
+            url = "https://api.semanticscholar.org/graph/v1/paper/search"
+            params = {
+                "query": query,
+                "limit": max_results,
+                "fields": "title,url,abstract,authors,year,citationCount",
+            }
+            response = requests.get(url, params=params, timeout=10)
+            response.raise_for_status()  # don't parse HTTP error bodies as results
+            data = response.json()
+            return [
+                {
+                    # The API sends null (not a missing key) for unknown fields
+                    "title": p.get("title") or "",
+                    "url": p.get("url") or "",
+                    "abstract": (p.get("abstract") or "")[:500],
+                    "authors": [a.get("name", "") for a in (p.get("authors") or [])[:5]],
+                    "year": p.get("year"),
+                    "citations": p.get("citationCount") or 0,
+                }
+                for p in data.get("data") or []
+            ]
+        except Exception as e:
+            logger.error("Semantic Scholar search error: %s", e)
+            return []
+
+
+class ResearchOrchestrator:
+    """Orchestrates research across multiple tools."""
+
+    def __init__(
+        self,
+        search_provider: str = "tavily",
+        use_wikipedia: bool = True,
+        use_academic: bool = True,
+    ):
+        """Initialize research orchestrator.
+
+        Args:
+            search_provider: Search provider to use
+            use_wikipedia: Include Wikipedia
+            use_academic: Include academic search
+        """
+        self.search = SearchTool(provider=search_provider)
+        self.wikipedia = WikipediaTool() if use_wikipedia else None
+        self.academic = AcademicSearchTool() if use_academic else None
+
+    def comprehensive_search(
+        self,
+        query: str,
+        include_web: bool = True,
+        include_wikipedia: bool = True,
+        include_academic: bool = True,
+    ) -> dict:
+        """Run comprehensive research across all sources.
+
+        Args:
+            query: Research query
+            include_web: Include web search
+            include_wikipedia: Include Wikipedia
+            include_academic: Include academic papers
+
+        Returns:
+            Combined research results
+        """
+        results = {
+            "query": query,
+            "timestamp": datetime.now().isoformat(),
+            "web": [],
+            "wikipedia": [],
+            "academic": [],
+            "innovations": [],
+        }
+
+        # Web search
+        if include_web:
+            results["web"] = self.search.search(query, num_results=10)
+
+        # Wikipedia
+        if self.wikipedia and include_wikipedia:
+            results["wikipedia"] = self.wikipedia.search(query, num_results=5)
+
+        # Academic
+        if self.academic and include_academic:
+            results["academic"] = self.academic.search_crossref(query, max_results=5)
+            results["academic"].extend(
+                self.academic.search_semantic_scholar(query, max_results=5)
+            )
+
+        # Generate innovations from research
+        results["innovations"] = self._generate_innovations(results)
+
+        return results
+
+    def _generate_innovations(self, research: dict) -> list[str]:
+        """Generate innovative ideas from research.
+
+        This analyzes the gathered information to spawn new ideas
+        and connections beyond the original training data.
+
+        Args:
+            research: Combined research results
+
+        Returns:
+            List of innovative ideas/connections
+        """
+        innovations = []
+
+        # Analyze web results for emerging trends
+        web_content = " ".join([
+            (r.get("content") or "")[:200] for r in research.get("web", [])[:5]
+        ])
+
+        # Analyze academic for research gaps
+        academic_titles = [a.get("title", "") for a in research.get("academic", [])[:5]]
+
+        # Look for intersections
+        if web_content and academic_titles:
+            innovations.append(
+                "Cross-disciplinary connection: Apply web trends to academic findings"
+            )
+
+        # Add research gaps identification
+        if len(research.get("academic", [])) < 3:
+            innovations.append(
+                "Research gap: Limited academic coverage - original contribution opportunity"
+            )
+
+        # Add timestamp for freshness
+        innovations.append(
+            f"Research timestamp: {research.get('timestamp')} - ensures current information"
+        )
+
+        return innovations
+
+    def deep_research(
+        self,
+        topic: str,
+        subtopics: Optional[list[str]] = None,
+    ) -> dict:
+        """Perform deep research on a topic and its subtopics.
+
+        Args:
+            topic: Main topic
+            subtopics: Related subtopics to research
+
+        Returns:
+            Deep research results
+        """
+        results = {
+            "main_topic": topic,
+            "main_research": self.comprehensive_search(topic),
+            "subtopic_research": {},
+        }
+
+        # Research each subtopic
+        for subtopic in (subtopics or []):
+            combined = f"{topic}: {subtopic}"
+            results["subtopic_research"][subtopic] = self.comprehensive_search(combined)
+
+        # Cross-reference all findings
+        results["cross_references"] = self._cross_reference(results)
+
+        return results
+
+    def _cross_reference(self, deep_results: dict) -> list[str]:
+        """Find cross-references between main and subtopic research."""
+        refs = []
+
+        main_content = " ".join([
+            (r.get("content") or "")[:300]
+            for r in deep_results.get("main_research", {}).get("web", [])[:3]
+        ])
+
+        for subtopic, sub_data in deep_results.get("subtopic_research", {}).items():
+            sub_content = " ".join([
+                (r.get("content") or "")[:300]
+                for r in sub_data.get("web", [])[:3]
+            ])
+
+            # Look for connections
+            if main_content and sub_content:
+                common_words = set(main_content.lower().split()) & set(sub_content.lower().split())
+                if len(common_words) > 10:
+                    refs.append(f"Connection found: {subtopic} relates to main topic via {len(common_words)} shared concepts")
+
+        return refs
+
+
+def create_research_orchestrator(
+    search_provider: str = "tavily",
+    use_wikipedia: bool = True,
+    use_academic: bool = True,
+) -> ResearchOrchestrator:
+    """Factory function to create research orchestrator.
+
+    Args:
+        search_provider: Search provider
+        use_wikipedia: Include Wikipedia
+        use_academic: Include academic search
+
+    Returns:
+        Configured ResearchOrchestrator
+    """
+    return ResearchOrchestrator(
+        search_provider=search_provider,
+        use_wikipedia=use_wikipedia,
+        use_academic=use_academic,
+    )
diff --git a/pyproject.toml b/pyproject.toml
index f4c67df..17acd5d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,11 @@ dependencies = [
     "tiktoken>=0.7.0",
     "markdown>=3.7",
     "python-dotenv>=1.0.0",
+    # Research dependencies (NEW!)
+    "tavily-python>=0.3.0",
+    "wikipedia>=1.4.0",
+    "arxiv>=1.4.0",
+    "duckduckgo-search>=7.0.0",
 ]
 
 [project.optional-dependencies]