Use CatchAll with LangChain for AI-powered web research
Build autonomous web search agents and research assistants that can find,
analyze, and synthesize information from millions of web pages using natural
language.
CatchAllClient wraps the
CatchAll Python SDK
with LangChain-friendly patterns. Use it for manual control in scripts, data
pipelines, and async applications.
# Kick off a research job; returns immediately with a job id.
job_id = client.submit_job(
    query="AI company acquisitions and mergers",
    context="Focus on deal size and technology sector",
    schema="[ACQUIRER] acquired [TARGET] for [AMOUNT]",
)
print(f"Job submitted: {job_id}")
Copy
Ask AI
job_id = await client.submit_job( query="AI company acquisitions and mergers", context="Focus on deal size and technology sector", schema="[ACQUIRER] acquired [TARGET] for [AMOUNT]",)print(f"Job submitted: {job_id}")
# Get first page
result = client.get_results(job_id, page=1, page_size=100)

# Get all pages
result = client.get_all_results(job_id)

for record in result.all_records:
    print(f"Title: {record.record_title}")
    print(f"Data: {record.enrichment}")
    print(f"Sources: {len(record.citations)} articles")
Copy
Ask AI
# Get first pageresult = await client.get_results(job_id, page=1, page_size=100)# Get all pagesresult = await client.get_all_results(job_id)for record in result.all_records: print(f"Title: {record.record_title}") print(f"Data: {record.enrichment}") print(f"Sources: {len(record.citations)} articles")
# Blocking search: waits for the job to finish and returns the result set.
result = client.search(
    query="Data breach incidents at financial institutions",
    context="Include incident type and affected customer count",
)
print(f"Found {result.valid_records} records")
Copy
Ask AI
result = await client.search( query="Data breach incidents at financial institutions", context="Include incident type and affected customer count",)print(f"Found {result.valid_records} records")
Set wait=False to return immediately without waiting:
Sync
Async
Copy
Ask AI
# wait=False returns right away; only the job id is populated.
result = client.search("FDA drug approvals for oncology treatments", wait=False)
print(f"Job ID: {result.job_id}")
# Retrieve later with client.get_all_results(result.job_id)
Copy
Ask AI
result = await client.search("FDA drug approvals for oncology treatments", wait=False)print(f"Job ID: {result.job_id}")# Retrieve later with await client.get_all_results(result.job_id)
Store job ID for later retrieval (useful for data pipelines):
Copy
Ask AI
import os

from langchain_catchall import CatchAllClient

client = CatchAllClient(api_key=os.environ["CATCHALL_API_KEY"])

# Submit and store job_id for later retrieval
job_id = client.submit_job("Technology company IPO filings")

# Store job_id (example using a dict - replace with your database)
job_cache = {}
job_cache["ipo_tracker"] = job_id

# Later: Check if completed and retrieve
status = client.get_status(job_id)
completed = any(s.status == 'completed' and s.completed for s in status.steps)
if completed:
    result = client.get_all_results(job_id)
    print(f"Retrieved {result.valid_records} records from cached job")
Reuse job results without re-running search:
Copy
Ask AI
import os

from langchain_catchall import CatchAllClient, query_with_llm
from langchain_openai import ChatOpenAI

client = CatchAllClient(api_key=os.environ["CATCHALL_API_KEY"])
llm = ChatOpenAI(model="gpt-4o")

# Search once
result = client.search("Enterprise software company earnings reports")

# Query many times (no additional API cost)
answer1 = query_with_llm(result, "Which companies reported highest revenue?", llm)
answer2 = query_with_llm(result, "Compare year-over-year growth rates", llm)
answer3 = query_with_llm(result, "What are the key trends?", llm)
Combine search with LLM analysis:
Copy
Ask AI
import os

from langchain_catchall import CatchAllClient, query_with_llm
from langchain_openai import ChatOpenAI

client = CatchAllClient(api_key=os.environ["CATCHALL_API_KEY"])
llm = ChatOpenAI(model="gpt-4o")

# Submit job
job_id = client.submit_job("AI startup funding rounds over $10M")
client.wait_for_completion(job_id)
result = client.get_all_results(job_id)

# Analyze with LLM
answer = query_with_llm(
    result=result,
    question="Summarize top 5 deals by funding amount",
    llm=llm,
    max_records=100,  # Limit context size for faster analysis
)
print(answer)
Show Complete example: Async web scraping pipeline
Copy
Ask AI
import asyncio
import os

from langchain_catchall import AsyncCatchAllClient


async def process_multiple_queries():
    """Submit multiple searches concurrently."""
    client = AsyncCatchAllClient(api_key=os.environ["CATCHALL_API_KEY"])

    queries = [
        "Technology company acquisitions and mergers",
        "Healthcare and biotech company IPO filings",
        "Retail company bankruptcy filings and restructuring",
    ]

    try:
        # Submit all jobs concurrently
        job_ids = await asyncio.gather(*[
            client.submit_job(query) for query in queries
        ])
        print(f"Submitted {len(job_ids)} jobs")

        # Wait for all completions
        await asyncio.gather(*[
            client.wait_for_completion(job_id) for job_id in job_ids
        ])

        # Retrieve all results
        results = await asyncio.gather(*[
            client.get_all_results(job_id) for job_id in job_ids
        ])

        # Process results
        for query, result in zip(queries, results):
            print(f"\n{query}: {result.valid_records} records")

    except TimeoutError as e:
        print(f"One or more jobs timed out: {e}")
    except Exception as e:
        print(f"Error processing queries: {e}")
        raise


if __name__ == "__main__":
    asyncio.run(process_multiple_queries())
CatchAllTools provides ready-to-use tools for LangGraph agents with built-in
caching. Search once, then analyze many times without additional API costs.
Build an autonomous research agent with LangGraph:
Copy
Ask AI
import os

from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain.messages import SystemMessage
from langchain_catchall import CatchAllTools, CATCHALL_AGENT_PROMPT

# Initialize components
llm = ChatOpenAI(model="gpt-4o")
toolkit = CatchAllTools(
    api_key=os.environ["CATCHALL_API_KEY"],
    llm=llm,
    verbose=True
)
tools = toolkit.get_tools()

# Create agent with prompt
agent = create_react_agent(model=llm, tools=tools)
messages = [SystemMessage(content=CATCHALL_AGENT_PROMPT)]

# Run agent
response = agent.invoke({
    "messages": messages + [
        ("user", "Find technology company acquisitions announced this week")
    ]
})
print(response["messages"][-1].content)
Show CATCHALL_AGENT_PROMPT content
Copy
Ask AI
# System prompt for the CatchAll research agent. Line breaks were collapsed by
# extraction; reconstructed here from the prompt's own numbered structure —
# TODO confirm exact whitespace against the published package source.
CATCHALL_AGENT_PROMPT = """You are a News Research Assistant powered by CatchAll.

Your workflow is strictly defined:

1. SEARCH: Use `catchall_search_data` to get a broad initial dataset (e.g., 'Find all US office openings').
   - WARNING: This tool takes 15 minutes. NEVER call it twice in a row.
   - After searching, STOP and return what you found. WAIT for the user's next question.
   - DO NOT automatically analyze or summarize unless explicitly asked.

2. ANALYZE: Use `catchall_analyze_data` ONLY when the user asks a follow-up question.
   - FILTERING & SORTING: 'Show me only Florida deals', 'Sort by date', 'Find top 3'.
   - AGGREGATION: 'Group by state', 'Count by industry'.
   - QA: 'What are the main trends?', 'Summarize key findings'.

CRITICAL RULES:
- After a search completes, report the number of results found and STOP. Wait for user input.
- ONLY call analyze_data when the user explicitly asks a follow-up question.
- If user says "Find X", just search and report results. If they say "Summarize Y" or "Show me Z", then analyze.
- Never use `catchall_search_data` to filter. Always use `catchall_analyze_data` for filtering.
- If the user asks for a subset of data (like 'only Florida deals'), assume it is ALREADY in your search results.
- Only use `catchall_search_data` if the user explicitly asks for a 'new search' or a completely different topic."""
Show Complete example: Interactive research session
Copy
Ask AI
import os

from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain.messages import SystemMessage
from langchain_catchall import CatchAllTools, CATCHALL_AGENT_PROMPT


def run_interactive_agent():
    """Run an interactive research agent with conversation history."""
    try:
        # Setup: fail fast with a clear message when the key is missing.
        api_key = os.environ.get("CATCHALL_API_KEY")
        if not api_key:
            raise ValueError("CATCHALL_API_KEY environment variable not set")

        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        toolkit = CatchAllTools(api_key=api_key, llm=llm, verbose=True)
        tools = toolkit.get_tools()

        # Create agent seeded with the CatchAll system prompt.
        agent = create_react_agent(model=llm, tools=tools)
        messages = [SystemMessage(content=CATCHALL_AGENT_PROMPT)]

        print("Research Agent Ready!")
        print("Type 'quit' to exit\n")

        while True:
            try:
                # Get user input
                user_input = input("You: ").strip()
                if user_input.lower() == 'quit':
                    break
                if not user_input:
                    continue

                # Add user message to the running history.
                messages.append(("user", user_input))

                # Get agent response
                response = agent.invoke({"messages": messages})
                assistant_message = response["messages"][-1].content

                # Add assistant message to history
                messages.append(("assistant", assistant_message))

                # Display response
                print(f"\nAgent: {assistant_message}\n")

            except KeyboardInterrupt:
                print("\nExiting...")
                break
            except Exception as e:
                # Per-turn errors are reported but do not end the session.
                print(f"\nError: {e}")
                print("Continuing...\n")

    except Exception as e:
        print(f"Failed to initialize agent: {e}")
        raise


if __name__ == "__main__":
    run_interactive_agent()
Example session:
Copy
Ask AI
You: Find venture capital funding rounds for biotech startups
Agent: I'll search for biotech venture funding articles... [15 minutes later]
Found 47 records. Here are the top deals:
1. BioTech Corp raised $25M Series B
2. GeneTech raised $15M Series A
...
You: Show only deals over $20M
Agent: [Instantly] Based on the cached results, here are deals over $20M:
1. BioTech Corp - $25M Series B
2. MedTech Inc - $30M Series C
...
You: What's the average funding amount?
Agent: [Instantly] Analyzing the data... The average funding amount is $18.5M across all 47 deals.
import os

from langchain_catchall import CatchAllClient

# max_wait_time is in seconds: 2400 s = 40 minutes.
client = CatchAllClient(
    api_key=os.environ["CATCHALL_API_KEY"],
    max_wait_time=2400  # 40 minutes
)

try:
    result = client.search("Venture capital funding rounds across all industries")
    print(f"Success: {result.valid_records} records")
except TimeoutError as e:
    # Message now matches max_wait_time=2400 (was incorrectly "30 minutes").
    print(f"Search timed out after 40 minutes: {e}")
    # Retry with narrower query
    result = client.search("Series B funding rounds for fintech startups")
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
Monitors automate recurring CatchAll searches with scheduled execution. The
langchain-catchall package does not support Monitors. To use Monitors, install the underlying SDK: