Add live research capabilities with innovation detection

Research Tools:
- SearchTool: Multiple backends (Tavily, Serper, Brave, DuckDuckGo)
- WikipediaTool: Wikipedia lookup
- AcademicSearchTool: CrossRef, Semantic Scholar
- ArxivTool: arXiv paper search
- ResearchOrchestrator: Comprehensive multi-source research

ResearchAgent:
- NOT just fact-checking - actively discovers NEW information
- Identifies trends beyond training data cutoff
- Generates innovations from cross-referencing sources
- Deep research with subtopics

VerifiedFactChecker:
- Live claim verification against web sources
- Confidence scoring
- Citation-needed detection

Dependencies added: tavily, wikipedia, arxiv, duckduckgo-search
This commit is contained in:
2026-03-13 05:03:52 +00:00
parent 6766e93c3d
commit 8cb29889cc
5 changed files with 862 additions and 0 deletions
+9
View File
@@ -22,6 +22,11 @@ from opus_orchestrator.agents.nonfiction import (
NonfictionWriterAgent,
ResearcherAgent,
)
from opus_orchestrator.agents.research import (
ResearchAgent,
VerifiedFactChecker,
create_research_agent,
)
from opus_orchestrator.config import OpusConfig, get_config
from opus_orchestrator.schemas import (
BookIntent,
@@ -83,6 +88,10 @@ __all__ = [
"NonfictionWriterAgent",
"FactCheckerAgent",
"NonfictionEditorAgent",
# Research Agent (NEW!)
"ResearchAgent",
"VerifiedFactChecker",
"create_research_agent",
# LangGraph
"OpusGraph",
"OpusGraphState",
+339
View File
@@ -0,0 +1,339 @@
"""Research Agent for Opus Orchestrator.
Enhanced nonfiction agent with live research capabilities.
"""
import os
from typing import Any, Optional
from dotenv import load_dotenv
load_dotenv()
from opus_orchestrator.agents.base import BaseAgent, AgentResponse
from opus_orchestrator.utils.research import (
ResearchOrchestrator,
create_research_orchestrator,
SearchTool,
WikipediaTool,
AcademicSearchTool,
)
# System prompt for the research agent. ResearchAgent passes this string to
# BaseAgent as ``system_prompt``; it describes the agent's role, the research
# process to follow, and the Markdown structure expected in the answer.
RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access
You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.
## Your Capabilities
1. **Web Search** - Search the current web for latest information
2. **Wikipedia** - Access encyclopedic knowledge
3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
4. **Innovation Detection** - Identify gaps and new ideas beyond training data
## Your Mission
NOT just verify facts — **DISCOVER new information, trends, and innovations**.
- Find what's NEW since your training cutoff
- Identify research gaps and opportunities
- Connect disparate ideas into novel insights
- Go beyond what you "know" to what you can FIND
## Research Process
1. **Explore** - Broad search on topic
2. **Deep Dive** - Specific searches on subtopics
3. **Cross-Reference** - Find connections between sources
4. **Innovate** - Generate original insights beyond training data
## Output Format
Provide your research in this structure:
```
## Findings (What you discovered)
- [New information 1]
- [New information 2]
- [Latest developments]
## Sources (Where you found it)
- [URL 1]: [Title]
- [URL 2]: [Title]
## Innovations (Original insights beyond training data)
- [Novel connection 1]
- [Novel connection 2]
## Research Gaps (What's not well-covered)
- [Gap 1]
- [Gap 2]
```
## Remember
You're not just fact-checking — you're RESEARCHING. Actively seek new information,
challenge assumptions, and generate original ideas. This keeps the content fresh
and prevents "AI slop" from repetitive training data patterns.
"""
class ResearchAgent(BaseAgent):
    """Enhanced research agent with live web access and innovation detection.

    Raw material is gathered through a ``ResearchOrchestrator`` and then
    handed to the LLM, which synthesises it into findings and insights.
    """

    def __init__(
        self,
        config=None,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research agent with tools.

        Args:
            config: Agent configuration
            search_provider: Search provider (tavily, serper, brave, duckduckgo)
            use_wikipedia: Include Wikipedia search
            use_academic: Include academic search
        """
        # Initialize research tools
        self.research = create_research_orchestrator(
            search_provider=search_provider,
            use_wikipedia=use_wikipedia,
            use_academic=use_academic,
        )
        self.search_tool = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

        super().__init__(
            role="Research Agent",
            description="Live web research with innovation detection",
            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
            config=config,
        )

    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
        """Execute research task with live tools.

        Args:
            input_data: Either a dict with ``query`` and optional ``subtopics``
                / ``deep_research`` keys, or any other value, which is
                stringified and used as the query.
            context: Additional context, forwarded to ``build_system_prompt``.

        Returns:
            ``AgentResponse`` whose output holds ``raw_results``, the LLM
            ``synthesis`` and the ``query``; on failure ``success`` is False
            and ``error`` describes what went wrong.
        """
        # Extract query
        if isinstance(input_data, dict):
            query = input_data.get("query", "")
            subtopics = input_data.get("subtopics", [])
            deep = input_data.get("deep_research", False)
        else:
            query = str(input_data)
            subtopics = []
            deep = False

        if not query:
            return AgentResponse(
                success=False,
                output=None,
                error="No research query provided",
                metadata={"role": "Research Agent"},
            )

        try:
            # Perform research
            if deep or subtopics:
                # Deep research with subtopics
                results = self.research.deep_research(query, subtopics)
            else:
                # Quick comprehensive search
                results = self.research.comprehensive_search(query)

            # Format results for LLM
            research_summary = self._format_research_for_llm(results)

            user_prompt = "\n".join([
                "Based on this research data, provide analysis and insights:",
                research_summary,
                f"Task: {query}",
                "Provide:",
                "1. Key findings synthesized",
                "2. Most important innovations/discoveries",
                "3. How this goes beyond typical training data",
                "4. Recommendations for the manuscript",
            ])

            # Use LLM to synthesize and provide analysis
            synthesis = await self.call_llm(
                system_prompt=self.build_system_prompt(context),
                user_prompt=user_prompt,
            )

            return AgentResponse(
                success=True,
                output={
                    "raw_results": results,
                    "synthesis": synthesis,
                    "query": query,
                },
                metadata={
                    "role": "Research Agent",
                    "search_provider": self.research.search.provider,
                },
            )
        except Exception as e:
            # Top-level boundary: report the failure instead of raising.
            return AgentResponse(
                success=False,
                output=None,
                error=f"Research failed: {str(e)}",
                metadata={"role": "Research Agent"},
            )

    def _format_research_for_llm(self, results: dict) -> str:
        """Format research results for LLM consumption.

        Handles both result shapes produced by the orchestrator: the flat
        dict from ``comprehensive_search`` and the nested dict from
        ``deep_research`` (``main_research`` / ``subtopic_research`` /
        ``cross_references``). Previously the nested shape was rendered as an
        empty summary because none of its keys were read.

        Args:
            results: Research results in either shape.

        Returns:
            Markdown-style text summarising the results.
        """
        if "main_research" not in results:
            return "\n".join(self._format_search_results(results))

        lines = [f"# Deep Research Topic: {results.get('main_topic', '')}", ""]
        lines.extend(self._format_search_results(results.get("main_research") or {}))

        subtopic_research = results.get("subtopic_research") or {}
        for subtopic, sub_results in subtopic_research.items():
            lines.append(f"# Subtopic: {subtopic}")
            lines.extend(self._format_search_results(sub_results or {}))

        cross_refs = results.get("cross_references") or []
        if cross_refs:
            lines.append("## Cross-References")
            lines.extend(f"- {ref}" for ref in cross_refs)
            lines.append("")

        return "\n".join(lines)

    @staticmethod
    def _format_search_results(results: dict) -> list[str]:
        """Render one flat search-result dict as a list of text lines."""
        lines = [
            f"# Research Query: {results.get('query', '')}",
            f"Timestamp: {results.get('timestamp', '')}",
            "",
        ]

        # Web results
        web = results.get("web") or []
        if web:
            lines.append("## Web Search Results")
            for rank, hit in enumerate(web[:5], 1):
                lines.append(f"{rank}. **{hit.get('title', '')}**")
                lines.append(f" URL: {hit.get('url', '')}")
                # ``or ''`` guards against providers returning None content.
                lines.append(f" {(hit.get('content') or '')[:200]}...")
            lines.append("")

        # Wikipedia
        wiki = results.get("wikipedia") or []
        if wiki:
            lines.append("## Wikipedia Results")
            for article in wiki[:3]:
                summary = (article.get("summary") or "")[:200]
                lines.append(f"- {article.get('title', '')}: {summary}...")
            lines.append("")

        # Academic
        academic = results.get("academic") or []
        if academic:
            lines.append("## Academic Papers")
            for paper in academic[:5]:
                lines.append(f"- {paper.get('title', '')} ({paper.get('year', 'N/A')})")
                lines.append(f" {paper.get('journal', '')}")
            lines.append("")

        # Innovations
        innovations = results.get("innovations") or []
        if innovations:
            lines.append("## Innovations & New Ideas")
            lines.extend(f"- {idea}" for idea in innovations)
            lines.append("")

        return lines
# Fact-checking with live verification
class VerifiedFactChecker:
    """Fact checker with live source verification.

    Classification is a keyword heuristic: a source supports a claim when it
    contains more than 70% of the claim's words, and contradicts it when it
    does not support it but contains a negation marker.
    """

    # Characters trimmed from token edges so that "cheese." matches "cheese".
    _PUNCTUATION = ".,;:!?\"'()[]{}<>"
    # Matched as whole words; a substring test would flag "nothing"/"another".
    _NEGATION_WORDS = frozenset({"not", "false", "incorrect"})

    def __init__(self, search_provider: str = "tavily"):
        """Initialize verified fact checker."""
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool()

    @classmethod
    def _tokenize(cls, text) -> set:
        """Return the set of lower-cased words in *text* (None-safe)."""
        words = (w.strip(cls._PUNCTUATION) for w in (text or "").lower().split())
        return {w for w in words if w}

    @staticmethod
    def _build_result(claim: str, supporting: list, contradicting: list, neutral: list) -> dict:
        """Assemble the verification result dict from the classified sources."""
        total = len(supporting) + len(contradicting) + len(neutral)
        confidence = len(supporting) / total if total else 0.0
        return {
            "claim": claim,
            "verified": len(supporting) > 0,
            "confidence": confidence,
            "supporting_sources": supporting,
            "contradicting_sources": contradicting,
            "neutral_sources": neutral,
            "needs_citation": confidence < 0.8,
        }

    async def verify_claim(
        self,
        claim: str,
        context: str = "",
    ) -> dict:
        """Verify a factual claim against live sources.

        Args:
            claim: The claim to verify
            context: Additional context (currently not used by the heuristic)

        Returns:
            Verification result with confidence and sources
        """
        claim_words = self._tokenize(claim)
        if not claim_words:
            # Nothing to look up: avoid pointless network calls.
            return self._build_result(claim, [], [], [])

        # Search the web and Wikipedia; both feed the same classification.
        web_results = self.search.search(claim, num_results=5) or []
        wiki_results = self.wikipedia.search(claim, num_results=2) or []

        supporting = []
        contradicting = []
        neutral = []
        for source in [*web_results, *wiki_results]:
            content_words = self._tokenize(source.get("content"))
            overlap = claim_words & content_words
            if len(overlap) > len(claim_words) * 0.7:
                supporting.append(source)
            elif content_words & self._NEGATION_WORDS:
                contradicting.append(source)
            else:
                neutral.append(source)

        return self._build_result(claim, supporting, contradicting, neutral)

    async def verify_batch(
        self,
        claims: list[str],
    ) -> list[dict]:
        """Verify multiple claims, one after another.

        Args:
            claims: List of claims to verify

        Returns:
            List of verification results, in the same order as *claims*
        """
        results = []
        for claim in claims:
            results.append(await self.verify_claim(claim))
        return results
def create_research_agent(
    search_provider: str = "tavily",
    *,
    config=None,
    use_wikipedia: bool = True,
    use_academic: bool = True,
) -> ResearchAgent:
    """Factory to create a research agent.

    Args:
        search_provider: Search provider
        config: Agent configuration forwarded to ``ResearchAgent``
        use_wikipedia: Include Wikipedia search
        use_academic: Include academic search

    Returns:
        Configured ResearchAgent
    """
    return ResearchAgent(
        config=config,
        search_provider=search_provider,
        use_wikipedia=use_wikipedia,
        use_academic=use_academic,
    )
+13
View File
@@ -5,6 +5,13 @@ from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_
from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
from opus_orchestrator.utils.llm import get_llm_client
from opus_orchestrator.utils.research import (
ResearchOrchestrator,
SearchTool,
WikipediaTool,
AcademicSearchTool,
create_research_orchestrator,
)
__all__ = [
"generate_docs",
@@ -15,4 +22,10 @@ __all__ = [
"LocalIngestor",
"create_local_ingestor",
"get_llm_client",
# Research (NEW!)
"ResearchOrchestrator",
"SearchTool",
"WikipediaTool",
"AcademicSearchTool",
"create_research_orchestrator",
]
+496
View File
@@ -0,0 +1,496 @@
"""Research tools for Opus Orchestrator.
Provides web search, database lookup, and research capabilities.
"""
import os
import json
from typing import Any, Optional, Callable
from datetime import datetime
import requests
from dotenv import load_dotenv
load_dotenv()
class SearchTool:
    """Web search tool using multiple backends.

    Every backend returns the same normalised shape: a list of dicts with
    ``title``, ``url``, ``content`` and ``score`` keys. Backend failures are
    reported on stdout and yield an empty list (best-effort search).
    """

    # Environment variable holding the API key for each keyed provider.
    _API_KEY_ENV = {
        "tavily": "TAVILY_API_KEY",
        "serper": "SERPER_API_KEY",
        "brave": "BRAVE_API_KEY",
    }

    def __init__(self, provider: str = "tavily"):
        """Initialize search tool.

        Args:
            provider: Search provider (tavily, serper, brave, duckduckgo).
                Any other value falls back to DuckDuckGo when searching.
        """
        self.provider = provider
        self._setup_provider()

    def _setup_provider(self):
        """Set up the search provider by loading its API key, if it has one."""
        env_var = self._API_KEY_ENV.get(self.provider)
        # Always define the attribute so keyless providers expose None
        # instead of raising AttributeError on access.
        self.api_key = os.environ.get(env_var) if env_var else None

    def search(
        self,
        query: str,
        num_results: int = 10,
    ) -> list[dict]:
        """Search the web.

        Args:
            query: Search query
            num_results: Number of results to return

        Returns:
            List of search results with title, url, content and score
        """
        backends = {
            "tavily": self._search_tavily,
            "serper": self._search_serper,
            "brave": self._search_brave,
        }
        backend = backends.get(self.provider, self._search_duckduckgo)
        return backend(query, num_results)

    def _search_tavily(self, query: str, num_results: int) -> list[dict]:
        """Search using Tavily."""
        try:
            from tavily import TavilyClient

            client = TavilyClient(api_key=self.api_key)
            results = client.search(query=query, max_results=num_results)
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "content": r.get("content", ""),
                    "score": r.get("score", 0),
                }
                for r in results.get("results", [])
            ]
        except Exception as e:
            print(f"Tavily search error: {e}")
            return []

    def _search_serper(self, query: str, num_results: int) -> list[dict]:
        """Search using Serper."""
        try:
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }
            payload = {"q": query, "num": num_results}
            response = requests.post(
                "https://google.serper.dev/search",
                headers=headers,
                json=payload,
                timeout=10,
            )
            # Surface auth/quota failures instead of parsing an error body.
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("link", ""),
                    "content": r.get("snippet", ""),
                    "score": 1.0,
                }
                for r in data.get("organic", [])
            ]
        except Exception as e:
            print(f"Serper search error: {e}")
            return []

    def _search_brave(self, query: str, num_results: int) -> list[dict]:
        """Search using Brave."""
        try:
            headers = {"Accept": "application/json", "X-Subscription-Token": self.api_key}
            response = requests.get(
                "https://api.search.brave.com/res/v1/web/search",
                params={"q": query, "count": num_results},
                headers=headers,
                timeout=10,
            )
            # Surface auth/quota failures instead of parsing an error body.
            response.raise_for_status()
            data = response.json()
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("url", ""),
                    "content": r.get("description", ""),
                    "score": r.get("score", 0),
                }
                for r in data.get("web", {}).get("results", [])
            ]
        except Exception as e:
            print(f"Brave search error: {e}")
            return []

    def _search_duckduckgo(self, query: str, num_results: int) -> list[dict]:
        """Search using DuckDuckGo (no API key needed)."""
        try:
            from duckduckgo_search import DDGS

            results = DDGS().text(query, max_results=num_results)
            return [
                {
                    "title": r.get("title", ""),
                    "url": r.get("href", ""),
                    "content": r.get("body", ""),
                    "score": 1.0,
                }
                for r in results or []
            ]
        except Exception as e:
            print(f"DuckDuckGo search error: {e}")
            return []
class WikipediaTool:
    """Wikipedia lookup tool backed by the ``wikipedia`` package."""

    def __init__(self):
        """Initialize Wikipedia tool."""
        pass

    def search(self, query: str, num_results: int = 5) -> list[dict]:
        """Search Wikipedia.

        Args:
            query: Search query
            num_results: Number of results

        Returns:
            List of Wikipedia articles (title, url, truncated summary and
            content). Pages that fail to load are skipped; an empty list is
            returned for a blank query or when the search itself fails.
        """
        if not query or not query.strip():
            return []
        try:
            import wikipedia

            titles = wikipedia.search(query, results=num_results)
            articles = []
            for title in titles:
                try:
                    # Titles come straight from the search API, so they are
                    # already exact; auto-suggest could swap in another page.
                    page = wikipedia.page(title, auto_suggest=False)
                except Exception:
                    # Best effort: skip disambiguation/missing/failed pages.
                    # ``Exception`` (not a bare except) lets KeyboardInterrupt
                    # and SystemExit propagate.
                    continue
                articles.append({
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary[:500],
                    "content": page.content[:2000],
                })
            return articles
        except Exception as e:
            print(f"Wikipedia search error: {e}")
            return []

    def get_article(self, title: str) -> dict:
        """Get a Wikipedia article by title.

        Args:
            title: Article title

        Returns:
            Article content, or ``{"error": message}`` when the lookup fails
            or the title is blank.
        """
        if not title or not title.strip():
            return {"error": "No article title provided"}
        try:
            import wikipedia

            page = wikipedia.page(title)
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary,
                "content": page.content[:5000],
                "references": page.references[:10] if hasattr(page, "references") else [],
            }
        except Exception as e:
            return {"error": str(e)}
class ArxivTool:
    """ArXiv paper search tool backed by the ``arxiv`` package."""

    def __init__(self):
        """Initialize ArXiv tool."""
        pass

    @staticmethod
    def _build_query(query: str, categories: Optional[list[str]] = None) -> str:
        """Combine the free-text query with an optional category filter.

        ``arxiv.Search`` has no ``categories`` argument, so the filter is
        expressed in the query string using ``cat:`` terms.
        """
        wanted = [c.strip() for c in (categories or []) if c and c.strip()]
        if not wanted:
            return query
        category_filter = " OR ".join(f"cat:{c}" for c in wanted)
        if not query or not query.strip():
            return category_filter
        return f"({query}) AND ({category_filter})"

    def search(
        self,
        query: str,
        max_results: int = 10,
        categories: Optional[list[str]] = None,
    ) -> list[dict]:
        """Search ArXiv for papers.

        Args:
            query: Search query
            max_results: Max results
            categories: ArXiv categories to filter (e.g. ``["cs.AI"]``)

        Returns:
            List of papers; empty when the search fails
        """
        try:
            import arxiv

            client = arxiv.Client()
            search = arxiv.Search(
                query=self._build_query(query, categories),
                max_results=max_results,
            )
            return [
                {
                    "title": result.title,
                    "url": result.entry_id,
                    "abstract": result.summary[:1000],
                    "authors": [a.name for a in result.authors],
                    "published": str(result.published.date()),
                    "categories": result.categories,
                }
                for result in client.results(search)
            ]
        except Exception as e:
            print(f"ArXiv search error: {e}")
            return []
class AcademicSearchTool:
    """Academic paper search (CrossRef, Semantic Scholar)."""

    def __init__(self):
        """Initialize academic search tool."""
        pass

    @staticmethod
    def _first(values, default=""):
        """Return the first element of a possibly missing or empty list."""
        return values[0] if values else default

    @classmethod
    def _parse_crossref_item(cls, item: dict) -> dict:
        """Normalise one CrossRef work record."""
        # Prefer the publication date; fall back to the DOI registration date.
        year = None
        for field in ("issued", "created"):
            date_parts = (item.get(field) or {}).get("date-parts") or []
            if date_parts and date_parts[0] and date_parts[0][0] is not None:
                year = date_parts[0][0]
                break

        authors = []
        for author in item.get("author") or []:
            name_parts = (author.get("given"), author.get("family"))
            full_name = " ".join(part for part in name_parts if part)
            # Organisational authors only carry a "name" field.
            authors.append(full_name or author.get("name", ""))

        return {
            "title": cls._first(item.get("title")),
            "url": item.get("URL", ""),
            "authors": authors,
            "year": year,
            "journal": cls._first(item.get("container-title")),
            "doi": item.get("DOI", ""),
        }

    @staticmethod
    def _parse_semantic_scholar_paper(paper: dict) -> dict:
        """Normalise one Semantic Scholar record.

        The API returns explicit nulls for missing fields, so ``or`` defaults
        are used instead of ``dict.get`` defaults.
        """
        return {
            "title": paper.get("title") or "",
            "url": paper.get("url") or "",
            "abstract": (paper.get("abstract") or "")[:500],
            "authors": [a.get("name", "") for a in (paper.get("authors") or [])[:5]],
            "year": paper.get("year"),
            "citations": paper.get("citationCount") or 0,
        }

    def search_crossref(self, query: str, max_results: int = 10) -> list[dict]:
        """Search CrossRef for academic papers.

        Args:
            query: Search query
            max_results: Max results

        Returns:
            List of papers; empty when the request fails
        """
        try:
            url = "https://api.crossref.org/works"
            params = {"query": query, "rows": max_results}
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            items = (data.get("message") or {}).get("items") or []
            return [self._parse_crossref_item(item) for item in items]
        except Exception as e:
            print(f"CrossRef search error: {e}")
            return []

    def search_semantic_scholar(self, query: str, max_results: int = 10) -> list[dict]:
        """Search Semantic Scholar for papers.

        Args:
            query: Search query
            max_results: Max results

        Returns:
            List of papers; empty when the request fails
        """
        try:
            url = "https://api.semanticscholar.org/graph/v1/paper/search"
            params = {
                "query": query,
                "limit": max_results,
                "fields": "title,url,abstract,authors,year,citationCount",
            }
            response = requests.get(url, params=params, timeout=10)
            response.raise_for_status()
            data = response.json()
            return [self._parse_semantic_scholar_paper(p) for p in data.get("data") or []]
        except Exception as e:
            print(f"Semantic Scholar search error: {e}")
            return []
class ResearchOrchestrator:
    """Orchestrates research across multiple tools."""

    def __init__(
        self,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research orchestrator.

        Args:
            search_provider: Search provider to use
            use_wikipedia: Include Wikipedia
            use_academic: Include academic search
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

    def comprehensive_search(
        self,
        query: str,
        include_web: bool = True,
        include_wikipedia: bool = True,
        include_academic: bool = True,
    ) -> dict:
        """Run comprehensive research across all sources.

        Args:
            query: Research query
            include_web: Include web search
            include_wikipedia: Include Wikipedia
            include_academic: Include academic papers

        Returns:
            Combined research results with ``query``, ``timestamp``, ``web``,
            ``wikipedia``, ``academic`` and ``innovations`` keys
        """
        results = {
            "query": query,
            "timestamp": datetime.now().isoformat(),
            "web": [],
            "wikipedia": [],
            "academic": [],
            "innovations": [],
        }

        # Web search
        if include_web:
            results["web"] = self.search.search(query, num_results=10)

        # Wikipedia
        if self.wikipedia and include_wikipedia:
            results["wikipedia"] = self.wikipedia.search(query, num_results=5)

        # Academic
        academic_searched = bool(self.academic and include_academic)
        if academic_searched:
            papers = list(self.academic.search_crossref(query, max_results=5))
            papers.extend(self.academic.search_semantic_scholar(query, max_results=5))
            results["academic"] = papers

        # Generate innovations from research
        results["innovations"] = self._generate_innovations(
            results, academic_searched=academic_searched
        )
        return results

    def _generate_innovations(self, research: dict, academic_searched: bool = True) -> list[str]:
        """Generate innovative ideas from research.

        Derives a few heuristic notes from the gathered results: a
        cross-disciplinary hint when both web content and academic titles
        exist, a research-gap note when fewer than three papers were found,
        and a freshness note carrying the timestamp.

        Args:
            research: Combined research results
            academic_searched: Whether academic sources were actually
                queried. When False no research gap is reported, because an
                empty paper list then says nothing about coverage.

        Returns:
            List of innovative ideas/connections
        """
        innovations = []

        # Analyze web results for emerging trends (None-safe content).
        web_content = " ".join(
            (r.get("content") or "")[:200] for r in (research.get("web") or [])[:5]
        )
        # Analyze academic for research gaps
        academic = research.get("academic") or []
        academic_titles = [a.get("title", "") for a in academic[:5]]

        # Look for intersections
        if web_content and academic_titles:
            innovations.append(
                "Cross-disciplinary connection: Apply web trends to academic findings"
            )

        # Add research gaps identification
        if academic_searched and len(academic) < 3:
            innovations.append(
                "Research gap: Limited academic coverage - original contribution opportunity"
            )

        # Add timestamp for freshness
        innovations.append(
            f"Research timestamp: {research.get('timestamp')} - ensures current information"
        )
        return innovations

    def deep_research(
        self,
        topic: str,
        subtopics: Optional[list[str]] = None,
    ) -> dict:
        """Perform deep research on a topic and its subtopics.

        Args:
            topic: Main topic
            subtopics: Related subtopics to research

        Returns:
            Deep research results with ``main_topic``, ``main_research``,
            ``subtopic_research`` (keyed by subtopic) and ``cross_references``
        """
        results = {
            "main_topic": topic,
            "main_research": self.comprehensive_search(topic),
            "subtopic_research": {},
        }

        # Research each subtopic
        for subtopic in subtopics or []:
            combined = f"{topic}: {subtopic}"
            results["subtopic_research"][subtopic] = self.comprehensive_search(combined)

        # Cross-reference all findings
        results["cross_references"] = self._cross_reference(results)
        return results

    @staticmethod
    def _leading_web_text(search_results: dict) -> str:
        """Join the first 300 characters of the top three web results."""
        return " ".join(
            (r.get("content") or "")[:300] for r in (search_results.get("web") or [])[:3]
        )

    def _cross_reference(self, deep_results: dict) -> list[str]:
        """Find cross-references between main and subtopic research.

        A subtopic is reported as connected when its leading web content
        shares more than ten distinct lower-cased words with the main topic's.
        """
        refs = []
        main_content = self._leading_web_text(deep_results.get("main_research") or {})
        if not main_content:
            return refs
        main_words = set(main_content.lower().split())

        for subtopic, sub_data in (deep_results.get("subtopic_research") or {}).items():
            sub_content = self._leading_web_text(sub_data or {})
            if not sub_content:
                continue
            # Look for connections
            common_words = main_words & set(sub_content.lower().split())
            if len(common_words) > 10:
                refs.append(f"Connection found: {subtopic} relates to main topic via {len(common_words)} shared concepts")
        return refs
def create_research_orchestrator(
    search_provider: str = "tavily",
    use_wikipedia: bool = True,
    use_academic: bool = True,
) -> ResearchOrchestrator:
    """Build a ``ResearchOrchestrator`` from the given options.

    Args:
        search_provider: Name of the web search provider to use
        use_wikipedia: Whether the orchestrator should query Wikipedia
        use_academic: Whether the orchestrator should query academic sources

    Returns:
        A ResearchOrchestrator configured with these options
    """
    options = {
        "search_provider": search_provider,
        "use_wikipedia": use_wikipedia,
        "use_academic": use_academic,
    }
    orchestrator = ResearchOrchestrator(**options)
    return orchestrator
+5
View File
@@ -30,6 +30,11 @@ dependencies = [
"tiktoken>=0.7.0",
"markdown>=3.7",
"python-dotenv>=1.0.0",
# Research dependencies (NEW!)
"tavily>=0.3.0",
"wikipedia>=1.4.0",
"arxiv>=1.4.0",
"duckduckgo-search>=7.0.0",
]
[project.optional-dependencies]