Add live research capabilities with innovation detection
Research Tools:
- SearchTool: Multiple backends (Tavily, Serper, Brave, DuckDuckGo)
- WikipediaTool: Wikipedia lookup
- AcademicSearchTool: CrossRef, Semantic Scholar
- ResearchOrchestrator: Comprehensive multi-source research

ResearchAgent:
- NOT just fact-checking - actively discovers NEW information
- Identifies trends beyond training data cutoff
- Generates innovations from cross-referencing sources
- Deep research with subtopics

VerifiedFactChecker:
- Live claim verification against web sources
- Confidence scoring
- Citation needed detection

Dependencies added: tavily, wikipedia, arxiv, duckduckgo-search
This commit is contained in:
@@ -22,6 +22,11 @@ from opus_orchestrator.agents.nonfiction import (
|
||||
NonfictionWriterAgent,
|
||||
ResearcherAgent,
|
||||
)
|
||||
from opus_orchestrator.agents.research import (
|
||||
ResearchAgent,
|
||||
VerifiedFactChecker,
|
||||
create_research_agent,
|
||||
)
|
||||
from opus_orchestrator.config import OpusConfig, get_config
|
||||
from opus_orchestrator.schemas import (
|
||||
BookIntent,
|
||||
@@ -83,6 +88,10 @@ __all__ = [
|
||||
"NonfictionWriterAgent",
|
||||
"FactCheckerAgent",
|
||||
"NonfictionEditorAgent",
|
||||
# Research Agent (NEW!)
|
||||
"ResearchAgent",
|
||||
"VerifiedFactChecker",
|
||||
"create_research_agent",
|
||||
# LangGraph
|
||||
"OpusGraph",
|
||||
"OpusGraphState",
|
||||
|
||||
@@ -0,0 +1,339 @@
|
||||
"""Research Agent for Opus Orchestrator.
|
||||
|
||||
Enhanced nonfiction agent with live research capabilities.
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from opus_orchestrator.agents.base import BaseAgent, AgentResponse
|
||||
from opus_orchestrator.utils.research import (
|
||||
ResearchOrchestrator,
|
||||
create_research_orchestrator,
|
||||
SearchTool,
|
||||
WikipediaTool,
|
||||
AcademicSearchTool,
|
||||
)
|
||||
|
||||
|
||||
# System prompt for research agent
|
||||
RESEARCH_AGENT_SYSTEM_PROMPT = """## Role: Research Agent with Live Web Access
|
||||
|
||||
You are The Researcher — an AI agent with live access to the internet, academic databases, and research tools.
|
||||
|
||||
## Your Capabilities
|
||||
|
||||
1. **Web Search** - Search the current web for latest information
|
||||
2. **Wikipedia** - Access encyclopedic knowledge
|
||||
3. **Academic Search** - Find peer-reviewed papers (CrossRef, Semantic Scholar)
|
||||
4. **Innovation Detection** - Identify gaps and new ideas beyond training data
|
||||
|
||||
## Your Mission
|
||||
|
||||
NOT just verify facts — **DISCOVER new information, trends, and innovations**.
|
||||
|
||||
- Find what's NEW since your training cutoff
|
||||
- Identify research gaps and opportunities
|
||||
- Connect disparate ideas into novel insights
|
||||
- Go beyond what you "know" to what you can FIND
|
||||
|
||||
## Research Process
|
||||
|
||||
1. **Explore** - Broad search on topic
|
||||
2. **Deep Dive** - Specific searches on subtopics
|
||||
3. **Cross-Reference** - Find connections between sources
|
||||
4. **Innovate** - Generate original insights beyond training data
|
||||
|
||||
## Output Format
|
||||
|
||||
Provide your research in this structure:
|
||||
|
||||
```
|
||||
## Findings (What you discovered)
|
||||
- [New information 1]
|
||||
- [New information 2]
|
||||
- [Latest developments]
|
||||
|
||||
## Sources (Where you found it)
|
||||
- [URL 1]: [Title]
|
||||
- [URL 2]: [Title]
|
||||
|
||||
## Innovations (Original insights beyond training data)
|
||||
- [Novel connection 1]
|
||||
- [Novel connection 2]
|
||||
|
||||
## Research Gaps (What's not well-covered)
|
||||
- [Gap 1]
|
||||
- [Gap 2]
|
||||
```
|
||||
|
||||
## Remember
|
||||
|
||||
You're not just fact-checking — you're RESEARCHING. Actively seek new information,
|
||||
challenge assumptions, and generate original ideas. This keeps the content fresh
|
||||
and prevents "AI slop" from repetitive training data patterns.
|
||||
"""
|
||||
|
||||
|
||||
class ResearchAgent(BaseAgent):
    """Enhanced research agent with live web access and innovation detection.

    The agent gathers material through a ``ResearchOrchestrator`` and then
    asks the LLM (``self.call_llm``) to synthesize the gathered data.
    """

    def __init__(
        self,
        config=None,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Initialize research agent with tools.

        Args:
            config: Agent configuration
            search_provider: Search provider (tavily, serper, brave, duckduckgo)
            use_wikipedia: Include Wikipedia search
            use_academic: Include academic search
        """
        # Initialize research tools
        self.research = create_research_orchestrator(
            search_provider=search_provider,
            use_wikipedia=use_wikipedia,
            use_academic=use_academic,
        )

        # Stand-alone tool handles, kept as public attributes for callers
        # that want to query a single source directly.
        self.search_tool = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool() if use_wikipedia else None
        self.academic = AcademicSearchTool() if use_academic else None

        super().__init__(
            role="Research Agent",
            description="Live web research with innovation detection",
            system_prompt=RESEARCH_AGENT_SYSTEM_PROMPT,
            config=config,
        )

    async def execute(self, input_data: Any, context: dict[str, Any]) -> AgentResponse:
        """Execute research task with live tools.

        Args:
            input_data: Either a plain query, or a dict with the keys
                ``query`` (str), ``subtopics`` (list of str) and
                ``deep_research`` (bool).
            context: Additional context, forwarded to ``build_system_prompt``.

        Returns:
            ``AgentResponse`` whose output holds ``raw_results``,
            ``synthesis`` and ``query`` on success. On failure ``success``
            is False and ``error`` describes the problem; exceptions raised
            while researching or calling the LLM are reported this way
            rather than propagated.
        """
        import asyncio  # local import: keeps this class self-contained

        # Extract query
        if isinstance(input_data, dict):
            query = input_data.get("query", "")
            subtopics = input_data.get("subtopics", [])
            deep = input_data.get("deep_research", False)
        else:
            query = str(input_data)
            subtopics = []
            deep = False

        if not query:
            return AgentResponse(
                success=False,
                output=None,
                error="No research query provided",
                metadata={"role": "Research Agent"},
            )

        try:
            # The orchestrator performs blocking network I/O; run it in a
            # worker thread so the event loop is not stalled meanwhile.
            if deep or subtopics:
                # Deep research with subtopics
                results = await asyncio.to_thread(
                    self.research.deep_research, query, subtopics
                )
            else:
                # Quick comprehensive search
                results = await asyncio.to_thread(
                    self.research.comprehensive_search, query
                )

            # Format results for LLM
            research_summary = self._format_research_for_llm(results)

            # Use LLM to synthesize and provide analysis
            synthesis = await self.call_llm(
                system_prompt=self.build_system_prompt(context),
                user_prompt=f"""Based on this research data, provide analysis and insights:

{research_summary}

Task: {query}

Provide:
1. Key findings synthesized
2. Most important innovations/discoveries
3. How this goes beyond typical training data
4. Recommendations for the manuscript""",
            )

            return AgentResponse(
                success=True,
                output={
                    "raw_results": results,
                    "synthesis": synthesis,
                    "query": query,
                },
                metadata={
                    "role": "Research Agent",
                    "search_provider": self.research.search.provider,
                },
            )

        except Exception as e:
            return AgentResponse(
                success=False,
                output=None,
                error=f"Research failed: {str(e)}",
                metadata={"role": "Research Agent"},
            )

    def _format_research_for_llm(self, results: dict) -> str:
        """Format research results for LLM consumption.

        Accepts both result shapes produced by the orchestrator: the flat
        dict from ``comprehensive_search`` and the nested dict from
        ``deep_research`` (recognised by its ``main_research`` key). The
        nested shape has no top-level ``web``/``wikipedia``/``academic``
        keys, so it is expanded section by section instead of being read
        as a flat result (which would yield an empty summary).

        Args:
            results: Research results in either shape.

        Returns:
            Markdown-style text summarising the results.
        """
        if "main_research" not in results:
            return "\n".join(self._search_result_lines(results))

        lines = [f"# Deep Research Topic: {results.get('main_topic', '')}", ""]
        lines.extend(self._search_result_lines(results.get("main_research") or {}))

        for subtopic, sub_results in (results.get("subtopic_research") or {}).items():
            lines.append(f"# Subtopic: {subtopic}")
            lines.extend(self._search_result_lines(sub_results or {}))

        cross_references = results.get("cross_references") or []
        if cross_references:
            lines.append("## Cross-References")
            lines.extend(f"- {ref}" for ref in cross_references)
            lines.append("")

        return "\n".join(lines)

    def _search_result_lines(self, results: dict) -> list[str]:
        """Render one flat search result dict as a list of text lines."""
        output = []

        # Query
        output.append(f"# Research Query: {results.get('query', '')}")
        output.append(f"Timestamp: {results.get('timestamp', '')}")
        output.append("")

        # Web results
        web = results.get("web") or []
        if web:
            output.append("## Web Search Results")
            for i, r in enumerate(web[:5], 1):
                output.append(f"{i}. **{r.get('title', '')}**")
                output.append(f" URL: {r.get('url', '')}")
                # ``or ''`` guards against a backend returning None content.
                output.append(f" {(r.get('content') or '')[:200]}...")
            output.append("")

        # Wikipedia
        wiki = results.get("wikipedia") or []
        if wiki:
            output.append("## Wikipedia Results")
            for r in wiki[:3]:
                output.append(f"- {r.get('title', '')}: {(r.get('summary') or '')[:200]}...")
            output.append("")

        # Academic
        academic = results.get("academic") or []
        if academic:
            output.append("## Academic Papers")
            for r in academic[:5]:
                output.append(f"- {r.get('title', '')} ({r.get('year', 'N/A')})")
                output.append(f" {r.get('journal', '')}")
            output.append("")

        # Innovations
        innovations = results.get("innovations") or []
        if innovations:
            output.append("## Innovations & New Ideas")
            for innovation in innovations:
                output.append(f"- {innovation}")
            output.append("")

        return output
|
||||
|
||||
|
||||
# Fact-checking with live verification
|
||||
class VerifiedFactChecker:
    """Fact checker with live source verification.

    Verification is a keyword heuristic: a source "supports" a claim when it
    contains more than 70% of the claim's distinct words, and "contradicts"
    it when it does not but contains a negation marker word.
    """

    # Whole words that mark a source as contradicting the claim.
    _NEGATION_MARKERS = frozenset({"not", "false", "incorrect"})

    def __init__(self, search_provider: str = "tavily"):
        """Initialize verified fact checker.

        Args:
            search_provider: Provider name passed to ``SearchTool``.
        """
        self.search = SearchTool(provider=search_provider)
        self.wikipedia = WikipediaTool()

    @staticmethod
    def _tokenize(text) -> set[str]:
        """Return the distinct lower-cased words of *text* (None-safe).

        Punctuation is dropped so that "France." matches "france".
        """
        import re

        return set(re.findall(r"\w+", (text or "").lower()))

    @staticmethod
    def _summarize(claim: str, supporting: list, contradicting: list, neutral: list) -> dict:
        """Build the verification result dict from the classified sources."""
        total = len(supporting) + len(contradicting) + len(neutral)
        confidence = len(supporting) / total if total else 0.0
        return {
            "claim": claim,
            "verified": len(supporting) > 0,
            "confidence": confidence,
            "supporting_sources": supporting,
            "contradicting_sources": contradicting,
            "neutral_sources": neutral,
            "needs_citation": confidence < 0.8,
        }

    async def verify_claim(
        self,
        claim: str,
        context: str = "",
    ) -> dict:
        """Verify a factual claim against live sources.

        Both the web search results and the Wikipedia results are
        classified; each source lands in exactly one of the three lists.

        Args:
            claim: The claim to verify
            context: Additional context (accepted for API compatibility;
                not used by the current heuristic)

        Returns:
            Dict with ``claim``, ``verified``, ``confidence`` (share of
            supporting sources), ``supporting_sources``,
            ``contradicting_sources``, ``neutral_sources`` and
            ``needs_citation`` (confidence below 0.8).
        """
        import asyncio  # local import: keeps this class self-contained

        # Tokenize once; the claim does not change per source.
        claim_words = self._tokenize(claim)
        if not claim_words:
            # Nothing to match against: skip the network round-trips.
            return self._summarize(claim, [], [], [])

        # The tools do blocking network I/O; keep the event loop responsive.
        web_results = await asyncio.to_thread(self.search.search, claim, num_results=5)
        wiki_results = await asyncio.to_thread(self.wikipedia.search, claim, num_results=2)

        supporting = []
        contradicting = []
        neutral = []
        support_threshold = len(claim_words) * 0.7

        for source in [*(web_results or []), *(wiki_results or [])]:
            content_words = self._tokenize(source.get("content"))
            overlap = claim_words & content_words

            if len(overlap) > support_threshold:
                supporting.append(source)
            elif content_words & self._NEGATION_MARKERS:
                # Whole-word match, so "notable" / "another" do not count
                # as the negation "not".
                contradicting.append(source)
            else:
                neutral.append(source)

        return self._summarize(claim, supporting, contradicting, neutral)

    async def verify_batch(
        self,
        claims: list[str],
    ) -> list[dict]:
        """Verify multiple claims.

        Claims are verified one after another, in order.

        Args:
            claims: List of claims to verify

        Returns:
            List of verification results, one per claim, in input order
        """
        return [await self.verify_claim(claim) for claim in claims]
|
||||
|
||||
|
||||
def create_research_agent(
    search_provider: str = "tavily",
) -> ResearchAgent:
    """Build a ``ResearchAgent`` for the given search backend.

    All other ``ResearchAgent`` options keep their defaults.

    Args:
        search_provider: Name of the web search provider to use.

    Returns:
        A newly constructed ResearchAgent.
    """
    agent = ResearchAgent(search_provider=search_provider)
    return agent
|
||||
@@ -5,6 +5,13 @@ from opus_orchestrator.utils.github_ingest import GitHubIngestor, create_github_
|
||||
from opus_orchestrator.utils.s3_ingest import S3Ingestor, create_s3_ingestor
|
||||
from opus_orchestrator.utils.local_ingest import LocalIngestor, create_local_ingestor
|
||||
from opus_orchestrator.utils.llm import get_llm_client
|
||||
from opus_orchestrator.utils.research import (
|
||||
ResearchOrchestrator,
|
||||
SearchTool,
|
||||
WikipediaTool,
|
||||
AcademicSearchTool,
|
||||
create_research_orchestrator,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"generate_docs",
|
||||
@@ -15,4 +22,10 @@ __all__ = [
|
||||
"LocalIngestor",
|
||||
"create_local_ingestor",
|
||||
"get_llm_client",
|
||||
# Research (NEW!)
|
||||
"ResearchOrchestrator",
|
||||
"SearchTool",
|
||||
"WikipediaTool",
|
||||
"AcademicSearchTool",
|
||||
"create_research_orchestrator",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,496 @@
|
||||
"""Research tools for Opus Orchestrator.
|
||||
|
||||
Provides web search, database lookup, and research capabilities.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
from typing import Any, Optional, Callable
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
class SearchTool:
    """Web search tool using multiple backends.

    Every backend returns results in one normalized shape: a list of dicts
    with the keys ``title``, ``url``, ``content`` and ``score``. Backend
    failures are printed and reported as an empty list, never raised.
    """

    # Environment variable holding the API key for each keyed provider.
    _API_KEY_ENV_VARS = {
        "tavily": "TAVILY_API_KEY",
        "serper": "SERPER_API_KEY",
        "brave": "BRAVE_API_KEY",
    }

    def __init__(self, provider: str = "tavily"):
        """Initialize search tool.

        Args:
            provider: Search provider (tavily, serper, brave, duckduckgo).
                Any other value falls back to DuckDuckGo when searching.
        """
        self.provider = provider
        self._setup_provider()

    def _setup_provider(self):
        """Set up the search provider.

        Reads the provider's API key from the environment. ``api_key`` is
        always defined afterwards; it is None for providers that need no
        key (or when the variable is unset).
        """
        env_var = self._API_KEY_ENV_VARS.get(self.provider)
        self.api_key = os.environ.get(env_var) if env_var else None

    @staticmethod
    def _result(title, url, content, score) -> dict:
        """Build one normalized result, replacing missing values.

        Text fields default to "" and the score to 0 so that consumers can
        slice and compare without None checks.
        """
        return {
            "title": title or "",
            "url": url or "",
            "content": content or "",
            "score": score if score is not None else 0,
        }

    def search(
        self,
        query: str,
        num_results: int = 10,
    ) -> list[dict]:
        """Search the web.

        Args:
            query: Search query
            num_results: Number of results to request from the backend

        Returns:
            List of search results with title, url, content, score
        """
        backends = {
            "tavily": self._search_tavily,
            "serper": self._search_serper,
            "brave": self._search_brave,
        }
        # Unknown providers use the key-less DuckDuckGo backend.
        backend = backends.get(self.provider, self._search_duckduckgo)
        return backend(query, num_results)

    def _search_tavily(self, query: str, num_results: int) -> list[dict]:
        """Search using Tavily."""
        try:
            from tavily import TavilyClient

            client = TavilyClient(api_key=self.api_key)
            results = client.search(query=query, max_results=num_results)
            return [
                self._result(r.get("title"), r.get("url"), r.get("content"), r.get("score"))
                for r in results.get("results", [])
            ]
        except Exception as e:
            print(f"Tavily search error: {e}")
            return []

    def _search_serper(self, query: str, num_results: int) -> list[dict]:
        """Search using Serper."""
        try:
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }
            payload = {"q": query, "num": num_results}
            response = requests.post(
                "https://google.serper.dev/search",
                headers=headers,
                json=payload,
                timeout=10,
            )
            # Surface auth/quota errors instead of parsing the error body
            # as if it were an (empty) result set.
            response.raise_for_status()
            data = response.json()
            return [
                self._result(r.get("title"), r.get("link"), r.get("snippet"), 1.0)
                for r in data.get("organic", [])
            ]
        except Exception as e:
            print(f"Serper search error: {e}")
            return []

    def _search_brave(self, query: str, num_results: int) -> list[dict]:
        """Search using Brave."""
        try:
            headers = {"Accept": "application/json", "X-Subscription-Token": self.api_key}
            response = requests.get(
                "https://api.search.brave.com/res/v1/web/search",
                params={"q": query, "count": num_results},
                headers=headers,
                timeout=10,
            )
            # Surface auth/quota errors instead of silently returning [].
            response.raise_for_status()
            data = response.json()
            return [
                self._result(r.get("title"), r.get("url"), r.get("description"), r.get("score"))
                for r in data.get("web", {}).get("results", [])
            ]
        except Exception as e:
            print(f"Brave search error: {e}")
            return []

    def _search_duckduckgo(self, query: str, num_results: int) -> list[dict]:
        """Search using DuckDuckGo (no API key needed)."""
        try:
            from duckduckgo_search import DDGS

            results = DDGS().text(query, max_results=num_results)
            return [
                self._result(r.get("title"), r.get("href"), r.get("body"), 1.0)
                for r in (results or [])
            ]
        except Exception as e:
            print(f"DuckDuckGo search error: {e}")
            return []
|
||||
|
||||
|
||||
class WikipediaTool:
    """Wikipedia lookup tool (thin wrapper around the ``wikipedia`` package)."""

    def __init__(self):
        """Initialize Wikipedia tool."""
        pass

    def search(self, query: str, num_results: int = 5) -> list[dict]:
        """Search Wikipedia.

        Titles that cannot be loaded (for example disambiguation pages) are
        skipped, so fewer than ``num_results`` articles may be returned.

        Args:
            query: Search query
            num_results: Number of results

        Returns:
            List of Wikipedia articles, each with ``title``, ``url``,
            ``summary`` (first 500 chars) and ``content`` (first 2000
            chars). Empty if the search itself fails.
        """
        try:
            import wikipedia

            titles = wikipedia.search(query, results=num_results)
        except Exception as e:
            print(f"Wikipedia search error: {e}")
            return []

        articles = []
        for title in titles:
            try:
                # Titles come straight from the search API, so they are
                # already exact; auto-suggest could swap in a different page.
                page = wikipedia.page(title, auto_suggest=False)
                articles.append({
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary[:500],
                    "content": page.content[:2000],
                })
            except Exception:
                # Best effort: one unloadable page must not lose the rest.
                # (Narrower than a bare ``except`` so Ctrl-C still works.)
                continue
        return articles

    def get_article(self, title: str) -> dict:
        """Get a Wikipedia article by title.

        Args:
            title: Article title

        Returns:
            Dict with ``title``, ``url``, ``summary``, ``content`` (first
            5000 chars) and ``references`` (up to 10), or ``{"error": ...}``
            if the page cannot be loaded.
        """
        try:
            import wikipedia

            page = wikipedia.page(title)
            try:
                # Reading references triggers a further lookup that can fail
                # independently of the page itself; treat that as "none".
                references = list(page.references[:10])
            except Exception:
                references = []
            return {
                "title": page.title,
                "url": page.url,
                "summary": page.summary,
                "content": page.content[:5000],
                "references": references,
            }
        except Exception as e:
            return {"error": str(e)}
|
||||
|
||||
|
||||
class ArxivTool:
    """ArXiv paper search tool (wrapper around the ``arxiv`` package)."""

    def __init__(self):
        """Initialize ArXiv tool."""
        pass

    @staticmethod
    def _build_query(query: str, categories: Optional[list[str]] = None) -> str:
        """Combine a free-text query with an optional category filter.

        ``arxiv.Search`` has no ``categories`` argument, so the filter is
        expressed in the query string itself using ``cat:`` terms.

        Args:
            query: Free-text search query.
            categories: ArXiv category codes (e.g. ``"cs.AI"``); a paper
                matches if it belongs to any of them.

        Returns:
            The query string to hand to ``arxiv.Search``.
        """
        if not categories:
            return query
        category_filter = " OR ".join(f"cat:{category}" for category in categories)
        if not query:
            return category_filter
        return f"({query}) AND ({category_filter})"

    def search(
        self,
        query: str,
        max_results: int = 10,
        categories: Optional[list[str]] = None,
    ) -> list[dict]:
        """Search ArXiv for papers.

        Args:
            query: Search query
            max_results: Max results
            categories: ArXiv categories to filter

        Returns:
            List of papers, each with ``title``, ``url``, ``abstract``
            (first 1000 chars), ``authors``, ``published`` and
            ``categories``. Empty if the search fails.
        """
        try:
            import arxiv

            client = arxiv.Client()
            search = arxiv.Search(
                query=self._build_query(query, categories),
                max_results=max_results,
            )
            return [
                {
                    "title": result.title,
                    "url": result.entry_id,
                    "abstract": result.summary[:1000],
                    "authors": [a.name for a in result.authors],
                    "published": str(result.published.date()),
                    "categories": result.categories,
                }
                for result in client.results(search)
            ]
        except Exception as e:
            print(f"ArXiv search error: {e}")
            return []
|
||||
|
||||
|
||||
class AcademicSearchTool:
    """Academic paper search (CrossRef, Semantic Scholar).

    Both searches print and swallow failures, returning an empty list.
    """

    def __init__(self):
        """Initialize academic search tool."""
        pass

    @staticmethod
    def _first(values, default=""):
        """Return the first element of a list field, or *default* if empty/missing."""
        return values[0] if values else default

    @staticmethod
    def _crossref_year(item: dict):
        """Return the work's year, or None if no usable date is present.

        Prefers the publication date (``issued``) and falls back to the
        record's ``created`` date.
        """
        for field in ("issued", "created"):
            date_parts = (item.get(field) or {}).get("date-parts") or []
            if date_parts and date_parts[0] and date_parts[0][0] is not None:
                return date_parts[0][0]
        return None

    @staticmethod
    def _crossref_author(author: dict) -> str:
        """Return "given family", or the ``name`` field if neither part is set."""
        parts = (author.get("given", ""), author.get("family", ""))
        full_name = " ".join(part for part in parts if part)
        return full_name or author.get("name", "")

    @classmethod
    def _parse_crossref_item(cls, item: dict) -> dict:
        """Convert one CrossRef work record to the normalized paper dict."""
        return {
            # List-valued fields may be absent or empty; never index blindly.
            "title": cls._first(item.get("title")),
            "url": item.get("URL", ""),
            "authors": [cls._crossref_author(a) for a in item.get("author") or []],
            "year": cls._crossref_year(item),
            "journal": cls._first(item.get("container-title")),
            "doi": item.get("DOI", ""),
        }

    @staticmethod
    def _parse_semantic_scholar_paper(p: dict) -> dict:
        """Convert one Semantic Scholar record to the normalized paper dict.

        Fields that are present but null in the response (commonly the
        abstract) are replaced by empty defaults.
        """
        return {
            "title": p.get("title") or "",
            "url": p.get("url") or "",
            "abstract": (p.get("abstract") or "")[:500],
            "authors": [a.get("name", "") for a in (p.get("authors") or [])[:5]],
            "year": p.get("year"),
            "citations": p.get("citationCount") or 0,
        }

    def search_crossref(self, query: str, max_results: int = 10) -> list[dict]:
        """Search CrossRef for academic papers.

        Args:
            query: Search query
            max_results: Maximum number of records to request

        Returns:
            List of papers with ``title``, ``url``, ``authors``, ``year``,
            ``journal`` and ``doi``.
        """
        try:
            url = "https://api.crossref.org/works"
            params = {"query": query, "rows": max_results}
            response = requests.get(url, params=params, timeout=10)
            # Report HTTP errors instead of parsing an error body.
            response.raise_for_status()
            data = response.json()
            return [
                self._parse_crossref_item(item)
                for item in data.get("message", {}).get("items", [])
            ]
        except Exception as e:
            print(f"CrossRef search error: {e}")
            return []

    def search_semantic_scholar(self, query: str, max_results: int = 10) -> list[dict]:
        """Search Semantic Scholar for papers.

        Args:
            query: Search query
            max_results: Maximum number of records to request

        Returns:
            List of papers with ``title``, ``url``, ``abstract`` (first 500
            chars), ``authors`` (up to 5), ``year`` and ``citations``.
        """
        try:
            url = "https://api.semanticscholar.org/graph/v1/paper/search"
            params = {
                "query": query,
                "limit": max_results,
                "fields": "title,url,abstract,authors,year,citationCount",
            }
            response = requests.get(url, params=params, timeout=10)
            # Report HTTP errors (e.g. rate limiting) instead of returning [].
            response.raise_for_status()
            data = response.json()
            return [
                self._parse_semantic_scholar_paper(p)
                for p in data.get("data") or []
            ]
        except Exception as e:
            print(f"Semantic Scholar search error: {e}")
            return []
|
||||
|
||||
|
||||
class ResearchOrchestrator:
    """Coordinate web, Wikipedia and academic lookups for a research query."""

    def __init__(
        self,
        search_provider: str = "tavily",
        use_wikipedia: bool = True,
        use_academic: bool = True,
    ):
        """Create the underlying tools.

        Args:
            search_provider: Provider name handed to ``SearchTool``.
            use_wikipedia: When False, ``self.wikipedia`` is None.
            use_academic: When False, ``self.academic`` is None.
        """
        self.search = SearchTool(provider=search_provider)

        self.wikipedia = None
        if use_wikipedia:
            self.wikipedia = WikipediaTool()

        self.academic = None
        if use_academic:
            self.academic = AcademicSearchTool()

    def comprehensive_search(
        self,
        query: str,
        include_web: bool = True,
        include_wikipedia: bool = True,
        include_academic: bool = True,
    ) -> dict:
        """Query every enabled source once and bundle the answers.

        Args:
            query: Research query.
            include_web: Run the web search.
            include_wikipedia: Run the Wikipedia search (if the tool exists).
            include_academic: Run CrossRef and Semantic Scholar (if the
                tool exists).

        Returns:
            Dict with the keys ``query``, ``timestamp``, ``web``,
            ``wikipedia``, ``academic`` and ``innovations``; skipped
            sources are empty lists.
        """
        # Timestamp is taken before any source is queried.
        started_at = datetime.now().isoformat()

        web_hits = []
        if include_web:
            web_hits = self.search.search(query, num_results=10)

        wiki_hits = []
        if self.wikipedia and include_wikipedia:
            wiki_hits = self.wikipedia.search(query, num_results=5)

        papers = []
        if self.academic and include_academic:
            # CrossRef results first, Semantic Scholar results appended.
            papers = self.academic.search_crossref(query, max_results=5)
            scholar_hits = self.academic.search_semantic_scholar(query, max_results=5)
            papers.extend(scholar_hits)

        report = {
            "query": query,
            "timestamp": started_at,
            "web": web_hits,
            "wikipedia": wiki_hits,
            "academic": papers,
            "innovations": [],
        }
        report["innovations"] = self._generate_innovations(report)
        return report

    def _generate_innovations(self, research: dict) -> list[str]:
        """Derive a few canned observations from the gathered results.

        Adds a cross-disciplinary note when both web text and academic
        titles exist, a research-gap note when fewer than three academic
        results were found, and always a timestamp note.

        Args:
            research: Combined research results.

        Returns:
            List of observation strings.
        """
        ideas = []

        snippets = (hit.get("content", "")[:200] for hit in research.get("web", [])[:5])
        web_text = " ".join(snippets)

        papers = research.get("academic", [])
        titles = [paper.get("title", "") for paper in papers[:5]]

        if web_text and titles:
            ideas.append(
                "Cross-disciplinary connection: Apply web trends to academic findings"
            )

        if len(papers) < 3:
            ideas.append(
                "Research gap: Limited academic coverage - original contribution opportunity"
            )

        ideas.append(
            f"Research timestamp: {research.get('timestamp')} - ensures current information"
        )
        return ideas

    def deep_research(
        self,
        topic: str,
        subtopics: list[str] = None,
    ) -> dict:
        """Research a topic, then each subtopic, then cross-reference them.

        Args:
            topic: Main topic.
            subtopics: Related subtopics; each is searched as
                ``"<topic>: <subtopic>"``.

        Returns:
            Dict with ``main_topic``, ``main_research``,
            ``subtopic_research`` (keyed by subtopic) and
            ``cross_references``.
        """
        report = {
            "main_topic": topic,
            "main_research": self.comprehensive_search(topic),
            "subtopic_research": {},
        }

        per_subtopic = report["subtopic_research"]
        for subtopic in subtopics or []:
            per_subtopic[subtopic] = self.comprehensive_search(f"{topic}: {subtopic}")

        report["cross_references"] = self._cross_reference(report)
        return report

    def _cross_reference(self, deep_results: dict) -> list[str]:
        """Report subtopics whose web snippets share many words with the main topic.

        Compares the first 300 characters of the top three web results on
        each side; more than 10 shared whitespace-separated words counts
        as a connection.
        """

        def joined_snippets(block: dict) -> str:
            return " ".join(
                hit.get("content", "")[:300] for hit in block.get("web", [])[:3]
            )

        main_text = joined_snippets(deep_results.get("main_research", {}))

        connections = []
        for subtopic, sub_data in deep_results.get("subtopic_research", {}).items():
            sub_text = joined_snippets(sub_data)
            if not (main_text and sub_text):
                continue

            shared = set(main_text.lower().split()).intersection(sub_text.lower().split())
            if len(shared) > 10:
                connections.append(f"Connection found: {subtopic} relates to main topic via {len(shared)} shared concepts")

        return connections
|
||||
|
||||
|
||||
def create_research_orchestrator(
    search_provider: str = "tavily",
    use_wikipedia: bool = True,
    use_academic: bool = True,
) -> ResearchOrchestrator:
    """Build a ``ResearchOrchestrator`` from the given options.

    Args:
        search_provider: Name of the web search provider.
        use_wikipedia: Whether the orchestrator gets a Wikipedia tool.
        use_academic: Whether the orchestrator gets an academic search tool.

    Returns:
        A newly constructed ResearchOrchestrator.
    """
    options = {
        "search_provider": search_provider,
        "use_wikipedia": use_wikipedia,
        "use_academic": use_academic,
    }
    return ResearchOrchestrator(**options)
|
||||
@@ -30,6 +30,11 @@ dependencies = [
|
||||
"tiktoken>=0.7.0",
|
||||
"markdown>=3.7",
|
||||
"python-dotenv>=1.0.0",
|
||||
# Research dependencies (NEW!)
|
||||
"tavily>=0.3.0",
|
||||
"wikipedia>=1.4.0",
|
||||
"arxiv>=1.4.0",
|
||||
"duckduckgo-search>=7.0.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
Reference in New Issue
Block a user