[{"title":"Live Web Data Access Reduces Agent Hallucinations by 65%","description":"Real-time web data integration cuts agent hallucination rates by 65%, establishing live data as essential for production agents.","section":"News","collection":"news","path":"/news/live-web-data-access-reduces-agent-hallucinations-by-65/","date":"2026-05-09T08:00:48.192Z","tags":["tool-use","eval","observability"],"entities":["Agents (aggregate)"],"audience":"builder","technical_depth":"intermediate","agent_type":"browser","stack_layer":"tool-use","maturity":"production"},{"title":"Agentic AI Shift Tops 2026 Stories Over Models","description":"Experts declare move from models to full agent systems as year's biggest AI development.","section":"News","collection":"news","path":"/news/agentic-ai-shift-tops-2026-stories-over-models/","date":"2026-05-08T07:46:42.318Z","tags":["trend","systems"],"entities":["PRWeek"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"production"},{"title":"Pentagon Cuts Anthropic Ties Over Agent Terms","description":"DOD dispute with Anthropic prompts new AI deals with Nvidia, MSFT, AWS for classified agents.","section":"News","collection":"news","path":"/news/pentagon-cuts-anthropic-ties-over-agent-terms/","date":"2026-05-08T07:46:42.318Z","tags":["policy","military","vendor"],"entities":["Pentagon","Anthropic","Nvidia","Microsoft","AWS"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"safety","maturity":"production"},{"title":"pydantic-ai 1.92.0 released","description":"Pydantic AI 1.92.0 introduces Anthropic task budget support and runtime `output_retries` override with deprecation of the old `retries` field, enhancing control over AI agent execution and reliability. 
It also fixes key bugs like streaming response cleanup on cancellation, MCP session task isolation to prevent exit scope errors, and proper population of `RunContext` with run/conversation IDs and metadata.","section":"News","collection":"news","path":"/news/pydantic-ai-1-92-0-release/","date":"2026-05-08T06:47:27.832Z","tags":["pydantic-ai","releases"],"entities":["pydantic-ai"],"audience":"builder","technical_depth":"intermediate","stack_layer":"framework"},{"title":"Agentic Stories Podcast Covers Governance News","description":"Daily briefing launches on AI agent economy, emphasizing governance, security, and deployment challenges.","section":"News","collection":"news","path":"/news/agentic-stories-podcast-covers-governance-news/","date":"2026-05-08T06:47:23.963Z","tags":["governance","observability"],"entities":["Agentic Stories"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"observability","maturity":"production"},{"title":"Anthropic-Pentagon Stalemate on Claude Usage","description":"Anthropic and DoD reach impasse over deploying Claude model in defense applications amid policy concerns.","section":"News","collection":"news","path":"/news/anthropic-pentagon-stalemate-on-claude-usage/","date":"2026-05-08T06:47:23.963Z","tags":["policy","government"],"entities":["Anthropic","Pentagon"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"safety","maturity":"production"},{"title":"Banking Vet Launches Enterprise Primitive AI Agents","description":"Former banking executive unveils enterprise-grade system for primitive AI agents targeting business automation.","section":"News","collection":"news","path":"/news/banking-vet-launches-enterprise-primitive-ai-agents/","date":"2026-05-08T06:47:23.963Z","tags":["funding","enterprise"],"entities":["Banking 
Veteran"],"audience":"builder","technical_depth":"intermediate","agent_type":"rpa","stack_layer":"orchestration","maturity":"production"},{"title":"AWS Launches AgentCore Payments — Agents Can Now Transact with Coinbase and Stripe","description":"Amazon Bedrock AgentCore now lets autonomous agents make payments via stablecoin micropayments, built with Coinbase x402 and Stripe Privy wallet infrastructure.","section":"News","collection":"news","path":"/news/aws-agentcore-payments-coinbase-stripe/","date":"2026-05-07T21:33:00.000Z","tags":["payments","infrastructure","multi-agent"],"entities":["AWS","Amazon Bedrock","Coinbase","Stripe","Privy"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"orchestration","maturity":"beta"},{"title":"Claude API Adds Streaming for High-Throughput Agents","description":"New streaming and batching endpoints in Claude API optimize for agentic deployments requiring real-time processing.","section":"News","collection":"news","path":"/news/claude-api-adds-streaming-for-high-throughput-agents/","date":"2026-05-07T20:04:22.225Z","tags":["tool-use","observability"],"entities":["Anthropic"],"audience":"builder","technical_depth":"intermediate","agent_type":"workflow","stack_layer":"tool-use","maturity":"production"},{"title":"MCP Agent Framework Hits 97M Installs Milestone","description":"March 25 stats reveal MCP, a key agentic infrastructure standard, reached 97 million installs, transforming agent development.","section":"News","collection":"news","path":"/news/mcp-agent-framework-hits-97m-installs-milestone/","date":"2026-05-07T20:04:22.225Z","tags":["agent frameworks","adoption"],"entities":["MCP"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"framework","maturity":"production"},{"title":"Mistral Small 4 Tops Reasoning Benchmarks for Agent Use","description":"22B-parameter Mistral Small 4 outperforms larger closed models on reasoning and instruction 
benchmarks critical for agents.","section":"News","collection":"news","path":"/news/mistral-small-4-tops-reasoning-benchmarks-for-agent-use/","date":"2026-05-07T20:04:22.225Z","tags":["model releases","tool-use"],"entities":["Mistral"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"model","maturity":"production"},{"title":"NVIDIA GTC Confirms Enterprise Agentic Production Deployments","description":"NVIDIA's GTC 2026 showcased Fortune 500 companies running agentic AI systems in production using NeMoCLAW and OpenCLAW frameworks.","section":"News","collection":"news","path":"/news/nvidia-gtc-confirms-enterprise-agentic-production-deployment/","date":"2026-05-07T20:04:22.225Z","tags":["agent frameworks","enterprise"],"entities":["NVIDIA","NeMoCLAW","OpenCLAW"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"production"},{"title":"OpenCLAW Released as Open-Source Agent Orchestration Framework","description":"Apache 2.0-licensed OpenCLAW launches as companion to NVIDIA's NeMoCLAW for enterprise multi-agent systems.","section":"News","collection":"news","path":"/news/openclaw-released-as-open-source-agent-orchestration-framewo/","date":"2026-05-07T20:04:22.225Z","tags":["agent frameworks","open source"],"entities":["OpenCLAW","NVIDIA"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"production"},{"title":"Five Eyes Warns on Agentic AI Risks","description":"Security agencies urge caution in deploying autonomous AI agents across business systems.","section":"News","collection":"news","path":"/news/five-eyes-warns-on-agentic-ai-risks/","date":"2026-05-07T19:14:14.997Z","tags":["agent policy","safety"],"entities":["Five Eyes"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"safety","maturity":"production"},{"title":"HPE Deploys Autonomous 
Networking Agents","description":"Self-driving agents optimize enterprise networks and cut tickets by 75%.","section":"News","collection":"news","path":"/news/hpe-deploys-autonomous-networking-agents/","date":"2026-05-07T19:14:14.997Z","tags":["workflow","rpa"],"entities":["HPE"],"audience":"builder","technical_depth":"intermediate","agent_type":"workflow","stack_layer":"orchestration","maturity":"production"},{"title":"Palo Alto Acquires Portkey for Agent Security","description":"Portkey's gateway protects autonomous agents processing trillions of tokens.","section":"News","collection":"news","path":"/news/palo-alto-acquires-portkey-for-agent-security/","date":"2026-05-07T19:14:14.997Z","tags":["observability","safety"],"entities":["Palo Alto Networks","Portkey"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"observability","maturity":"production"},{"title":"UiPath Adds Agentic Automation to Self-Hosted Suite","description":"Agentic AI now available for on-prem environments in regulated sectors.","section":"News","collection":"news","path":"/news/uipath-adds-agentic-automation-to-self-hosted-suite/","date":"2026-05-07T19:14:14.997Z","tags":["agent frameworks","rpa"],"entities":["UiPath"],"audience":"builder","technical_depth":"intermediate","agent_type":"rpa","stack_layer":"framework","maturity":"production"},{"title":"Clawdbot Open-Source Agent Drives Mac Mini Hardware Shortage","description":"An open-source version of OpenClaw called Clawdbot went viral, causing Apple Mac Minis to sell out as users rushed to purchase always-on hardware for local agent deployment.","section":"News","collection":"news","path":"/news/clawdbot-open-source-agent-drives-mac-mini-hardware-shortage/","date":"2026-05-07T18:52:44.972Z","tags":["open-source","agent-deployment","hardware","privacy"],"entities":["Clawdbot","Apple","Mac 
Mini"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"distribution","maturity":"production"},{"title":"ServiceTrade Unveils Stella AI Agents for Field Service","description":"ServiceTrade launches Stella suite with Quote and Schedule agents to automate field operations.","section":"News","collection":"news","path":"/news/servicetrade-unveils-stella-ai-agents-for-field-service/","date":"2026-05-07T18:29:21.157Z","tags":["field-service","automation"],"entities":["ServiceTrade"],"audience":"builder","technical_depth":"intermediate","agent_type":"rpa","stack_layer":"orchestration","maturity":"production"},{"title":"CORAS.ai Ships Agentic Reporting for Defense, Replaces BI Tools","description":"CORAS.ai launches agentic AI reporting platform on May 5, consolidating defense BI systems into one IL5 tool.","section":"News","collection":"news","path":"/news/coras-ai-ships-agentic-reporting-for-defense-replaces-bi-too/","date":"2026-05-07T08:38:53.374Z","tags":["infrastructure","defense"],"entities":["CORAS.ai"],"audience":"builder","technical_depth":"intermediate","agent_type":"workflow","stack_layer":"orchestration","maturity":"production"},{"title":"Anthropic Secures xAI's Colossus-1 Compute in Surprise Cross-Rival Deal","description":"Anthropic has signed an agreement with SpaceX to access all 300MW of compute capacity at xAI's Colossus 1 data centre in Memphis, immediately raising usage limits for Claude Pro, Max, and API subscribers.","section":"News","collection":"news","path":"/news/anthropic-spacex-xai-colossus-compute-deal/","date":"2026-05-06T16:41:40.000Z","tags":["anthropic","xai","spacex","compute","infrastructure","claude"],"entities":["Anthropic","xAI","SpaceX","Elon Musk","Dario Amodei"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"model"},{"title":"autogen python-v0.7.5 released","description":"AutoGen v0.7.5 adds linear memory support in RedisMemory, enabling more 
scalable and efficient long‑running agent conversations. It also introduces thinking mode for the Anthropic client and fixes several streaming, tool‑call, and correlation issues that improve reliability and performance for agent builders.","section":"News","collection":"news","path":"/news/autogen-python-v0-7-5-release/","date":"2026-05-06T10:05:50.311Z","tags":["autogen","releases"],"entities":["autogen"],"audience":"builder","technical_depth":"intermediate","stack_layer":"framework"},{"title":"crewai 1.14.4 released","description":"CrewAI 1.14.4 introduces enhanced cloud provider support with custom persistence keys for @persist, Responses API for Azure OpenAI, and new search/research tools via Tavily and You.com MCP integration. The release also includes critical bug fixes for JSON parsing, tool call preservation, and multimodal input handling, improving reliability for production agent deployments.","section":"News","collection":"news","path":"/news/crewai-1-14-4-release/","date":"2026-05-06T10:05:50.311Z","tags":["crewai","releases"],"entities":["crewai"],"audience":"builder","technical_depth":"intermediate","stack_layer":"framework"},{"title":"langgraph sdk==0.3.14 released","description":"LangGraph SDK 0.3.14 introduces a `return_minimal` parameter for threads update operations, enabling more efficient API responses for AI agent builders. 
The release also includes streaming transformer infrastructure and support for `stream_events(version='v3')` on Pregel, providing enhanced control over event streaming in agent workflows.","section":"News","collection":"news","path":"/news/langgraph-sdk-0-3-14-release/","date":"2026-05-06T10:05:50.311Z","tags":["langgraph","releases"],"entities":["langgraph"],"audience":"builder","technical_depth":"intermediate","stack_layer":"framework"},{"title":"letta 0.16.7 released","description":"Letta 0.16.7 raises the default global context window from 32k to 128k and fixes the context window reset bug, with a completely overhauled compaction system that eliminates most manual configuration workarounds for self-hosted users. Block limits are no longer enforced, allowing blocks to grow freely, though users must now manage block size through alternative means if they were previously relying on limits to control per-turn costs.","section":"News","collection":"news","path":"/news/letta-0-16-7-release/","date":"2026-05-06T10:05:50.311Z","tags":["letta","releases"],"entities":["letta"],"audience":"builder","technical_depth":"intermediate","stack_layer":"framework"},{"title":"Anthropic Zero-Day Flaw Exposes 200K AI Agent Servers","description":"Critical vulnerability in Anthropic's Model Context Protocol triggers $25B security overhaul with Amazon.","section":"News","collection":"news","path":"/news/anthropic-zero-day-flaw-exposes-200k-ai-agent-servers/","date":"2026-05-05T22:09:04.592Z","tags":["security","vulnerability","infrastructure"],"entities":["Anthropic","Amazon"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"safety","maturity":"production"},{"title":"NVIDIA Launches Nemotron 3 Nano Omni Unified Agent Model","description":"NVIDIA releases Nemotron 3 Nano Omni, unifying vision, audio, and language for faster AI agent 
processing.","section":"News","collection":"news","path":"/news/nvidia-launches-nemotron-3-nano-omni-unified-agent-model/","date":"2026-05-05T22:09:04.592Z","tags":["model-release","multi-modal","agents"],"entities":["NVIDIA","Nemotron 3 Nano Omni"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"model","maturity":"production"},{"title":"Anthropic moves Computer Use out of beta, ships native sandbox primitive","description":"Claude's screen-grounded agent loop graduates with new tool-use primitives, an isolated sandbox, and tighter rate-limit policy for production deployments.","section":"News","collection":"news","path":"/news/anthropic-computer-use-ga/","date":"2026-04-22T09:30:00.000Z","tags":["anthropic","computer-use","browser-agents","sandbox"],"entities":["Anthropic","Claude"],"audience":"builder","technical_depth":"intermediate","agent_type":"browser","stack_layer":"tool-use","maturity":"production"},{"title":"Arize Phoenix","description":"OpenTelemetry-native LLM observability and evaluation.","section":"Tools","collection":"tools","path":"/tools/arize-phoenix/","date":"2026-04-20T00:00:00.000Z","tags":["observability","otel"],"entities":["Arize AI"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"observability","maturity":"production"},{"title":"AutoGen","description":"Conversational multi-agent framework with strong reasoning patterns.","section":"Tools","collection":"tools","path":"/tools/autogen/","date":"2026-04-20T00:00:00.000Z","tags":["multi-agent","conversation"],"entities":["Microsoft Research"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"production"},{"title":"Browserbase","description":"Hosted, isolated browsers for agent automation with session 
replay.","section":"Tools","collection":"tools","path":"/tools/browserbase/","date":"2026-04-20T00:00:00.000Z","tags":["browser","sandbox","hosted"],"entities":["Browserbase"],"audience":"builder","technical_depth":"intermediate","agent_type":"browser","stack_layer":"sandbox","maturity":"production"},{"title":"Continue","description":"Open-source coding-agent IDE extension for VS Code and JetBrains.","section":"Tools","collection":"tools","path":"/tools/continue-dev/","date":"2026-04-20T00:00:00.000Z","tags":["coding","ide"],"entities":["Continue"],"audience":"builder","technical_depth":"intermediate","agent_type":"coding","stack_layer":"distribution","maturity":"production"},{"title":"CrewAI","description":"Role-based multi-agent framework with declarative crew definitions.","section":"Tools","collection":"tools","path":"/tools/crewai/","date":"2026-04-20T00:00:00.000Z","tags":["multi-agent","python"],"entities":["CrewAI"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"production"},{"title":"E2B","description":"Cloud sandboxes for code-running AI agents.","section":"Tools","collection":"tools","path":"/tools/e2b/","date":"2026-04-20T00:00:00.000Z","tags":["sandbox","code-execution"],"entities":["E2B"],"audience":"builder","technical_depth":"intermediate","agent_type":"coding","stack_layer":"sandbox","maturity":"production"},{"title":"Haystack","description":"Pipelines for retrieval-heavy agent workloads.","section":"Tools","collection":"tools","path":"/tools/haystack-agents/","date":"2026-04-20T00:00:00.000Z","tags":["rag","retrieval"],"entities":["deepset"],"audience":"builder","technical_depth":"intermediate","agent_type":"retrieval","stack_layer":"framework","maturity":"mature"},{"title":"Helicone","description":"Lightweight LLM observability with a proxy-first 
model.","section":"Tools","collection":"tools","path":"/tools/helicone/","date":"2026-04-20T00:00:00.000Z","tags":["observability","proxy"],"entities":["Helicone"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"observability","maturity":"production"},{"title":"Inngest Agent Kit","description":"Durable workflows and step functions for agents.","section":"Tools","collection":"tools","path":"/tools/inngest-agent/","date":"2026-04-20T00:00:00.000Z","tags":["workflow","durable","typescript"],"entities":["Inngest"],"audience":"builder","technical_depth":"intermediate","agent_type":"workflow","stack_layer":"orchestration","maturity":"production"},{"title":"Langfuse","description":"Open-source observability for LLM and agent applications.","section":"Tools","collection":"tools","path":"/tools/langfuse/","date":"2026-04-20T00:00:00.000Z","tags":["observability","tracing"],"entities":["Langfuse"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"observability","maturity":"production"},{"title":"LangGraph","description":"Stateful, graph-based orchestration for LLM workflows with deterministic checkpoints.","section":"Tools","collection":"tools","path":"/tools/langgraph/","date":"2026-04-20T00:00:00.000Z","tags":["orchestration","python","graph"],"entities":["LangChain"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"production"},{"title":"Letta (formerly MemGPT)","description":"Long-term memory primitive: hierarchical context with explicit recall calls.","section":"Tools","collection":"tools","path":"/tools/letta/","date":"2026-04-20T00:00:00.000Z","tags":["memory","long-horizon"],"entities":["Letta"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"memory","maturity":"beta"},{"title":"Lindy","description":"No-code agent builder for business operations 
workflows.","section":"Tools","collection":"tools","path":"/tools/lindy/","date":"2026-04-20T00:00:00.000Z","tags":["no-code","operations"],"entities":["Lindy"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"distribution","maturity":"production"},{"title":"MCP Toolbox","description":"Reference servers and clients for the Model Context Protocol.","section":"Tools","collection":"tools","path":"/tools/mcp-toolbox/","date":"2026-04-20T00:00:00.000Z","tags":["mcp","tool-use","protocol"],"entities":["Model Context Protocol"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"tool-use","maturity":"beta"},{"title":"Mistral Agents API","description":"Hosted agent runtime with native function calling and code execution.","section":"Tools","collection":"tools","path":"/tools/mistral-agents/","date":"2026-04-20T00:00:00.000Z","tags":["hosted","european"],"entities":["Mistral"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"framework","maturity":"beta"},{"title":"Modal","description":"Serverless infra for agent workloads — sandboxes, GPUs, schedules.","section":"Tools","collection":"tools","path":"/tools/modal-agents/","date":"2026-04-20T00:00:00.000Z","tags":["serverless","sandbox","gpu"],"entities":["Modal Labs"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"sandbox","maturity":"production"},{"title":"OpenPipe","description":"Distill production agent traffic into smaller fine-tuned models.","section":"Tools","collection":"tools","path":"/tools/openpipe/","date":"2026-04-20T00:00:00.000Z","tags":["fine-tune","distillation"],"entities":["OpenPipe"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"model","maturity":"production"},{"title":"PydanticAI","description":"Type-safe agent framework from the team behind 
Pydantic.","section":"Tools","collection":"tools","path":"/tools/pydantic-ai/","date":"2026-04-20T00:00:00.000Z","tags":["type-safe","python"],"entities":["Pydantic"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"framework","maturity":"beta"},{"title":"Temporal","description":"Durable workflow engine increasingly used for long-running agents.","section":"Tools","collection":"tools","path":"/tools/temporal-agents/","date":"2026-04-20T00:00:00.000Z","tags":["durable","workflow"],"entities":["Temporal"],"audience":"builder","technical_depth":"intermediate","agent_type":"workflow","stack_layer":"orchestration","maturity":"mature"},{"title":"Weights & Biases Weave","description":"Tracing, evals, and experiment tracking unified.","section":"Tools","collection":"tools","path":"/tools/weights-and-traces/","date":"2026-04-20T00:00:00.000Z","tags":["observability","eval"],"entities":["Weights & Biases"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"observability","maturity":"production"},{"title":"OpenAI ships Swarm 2 with built-in handoff tracing and per-agent budgets","description":"Swarm 2 introduces a structured handoff log, hard token budgets per agent, and an interoperability shim for LangGraph and CrewAI.","section":"News","collection":"news","path":"/news/openai-swarm-2-multi-agent/","date":"2026-04-19T16:05:00.000Z","tags":["openai","multi-agent","orchestration","tracing"],"entities":["OpenAI","Swarm","LangGraph","CrewAI"],"audience":"builder","technical_depth":"intermediate","agent_type":"multi-agent","stack_layer":"orchestration","maturity":"beta"},{"title":"Reflexion, three years on: what self-critique still buys you","description":"A meta-analysis of 41 papers building on Reflexion-style self-critique loops finds modest, durable gains in coding and tool-use, and diminishing returns in open-ended 
reasoning.","section":"Research","collection":"research","path":"/research/reflexion-revisited/","date":"2026-04-18T10:00:00.000Z","tags":["self-critique","reflexion","meta-analysis"],"entities":["Northeastern","Reflexion"],"audience":"researcher","technical_depth":"deep","agent_type":"general","stack_layer":"orchestration","maturity":"mature"},{"title":"Google opens Gemini Agent SDK with first-party MCP server registry","description":"The Agent SDK ships with a curated MCP registry, native long-running task support, and managed memory tied to Vertex AI.","section":"News","collection":"news","path":"/news/google-gemini-agent-sdk/","date":"2026-04-15T11:00:00.000Z","tags":["google","gemini","mcp","sdk"],"entities":["Google","Gemini","Vertex AI","MCP"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"framework","maturity":"beta"},{"title":"Long-horizon memory: survey of seven architectures, ranked by recall and cost","description":"Compares episodic, semantic, hybrid, and graph-based memory across realistic 30-day agent simulations. 
Hybrid stores win on recall; graph stores win on cost stability.","section":"Research","collection":"research","path":"/research/long-horizon-memory-survey/","date":"2026-04-14T09:30:00.000Z","tags":["memory","long-horizon","survey"],"entities":["Stanford","MemGPT","Letta"],"audience":"builder","technical_depth":"deep","agent_type":"general","stack_layer":"memory","maturity":"mature"},{"title":"SWE-bench Verified hits 78%, prompting calls for a harder coding eval","description":"Top coding agents now resolve more than three of every four tasks in SWE-bench Verified, reigniting debate over whether the benchmark still discriminates between systems.","section":"News","collection":"news","path":"/news/swe-bench-verified-saturated/","date":"2026-04-12T08:00:00.000Z","tags":["benchmarks","evaluation","coding-agents"],"entities":["SWE-bench","Princeton"],"audience":"researcher","technical_depth":"deep","agent_type":"coding","stack_layer":"eval","maturity":"mature"},{"title":"Add long-term memory with Letta","description":"Wire a hierarchical memory store into an existing agent and audit what it remembers.","section":"Build","collection":"build","path":"/build/memory-with-letta/","date":"2026-04-12T00:00:00.000Z","tags":["memory","letta","tutorial"],"entities":[],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"memory","maturity":"beta"},{"title":"Build a replay-based eval set in a weekend","description":"How to capture, redact, and score real production sessions to evaluate agent candidates.","section":"Build","collection":"build","path":"/build/evals-replay-set/","date":"2026-04-12T00:00:00.000Z","tags":["eval","replay","production"],"entities":[],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"eval","maturity":"production"},{"title":"Build your first production agent with LangGraph","description":"A 90-minute walkthrough that ships a tool-using agent with persistent state, retries, 
and observability.","section":"Build","collection":"build","path":"/build/first-agent-langgraph/","date":"2026-04-12T00:00:00.000Z","tags":["langgraph","python","tutorial"],"entities":[],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"orchestration","maturity":"production"},{"title":"Cost controls for agent workloads","description":"Token budgets, fallback tiers, and the dashboards that catch runaway runs before they hurt.","section":"Build","collection":"build","path":"/build/cost-controls/","date":"2026-04-12T00:00:00.000Z","tags":["cost","finops"],"entities":[],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"observability","maturity":"production"},{"title":"Ship a browser agent on Browserbase","description":"A production browser-agent stack with anti-bot resilience, session replay, and a kill switch.","section":"Build","collection":"build","path":"/build/browser-agent-on-browserbase/","date":"2026-04-12T00:00:00.000Z","tags":["browser","browserbase","tutorial"],"entities":[],"audience":"builder","technical_depth":"intermediate","agent_type":"browser","stack_layer":"sandbox","maturity":"production"},{"title":"Customer support: agent-led deflection at the contact moment","description":"How leading B2C teams are reducing tier-1 ticket volume by 35-55% with a tightly-scoped support agent.","section":"Use Cases","collection":"use-cases","path":"/use-cases/customer-support-deflection/","date":"2026-04-10T00:00:00.000Z","tags":["support","b2c","deflection"],"entities":[],"audience":"operator","technical_depth":"intermediate","agent_type":"general","stack_layer":"distribution","maturity":"production"},{"title":"Engineering: internal-tool agents over the API graph","description":"How platform teams replace one-off internal dashboards with a shared agent over their API graph.","section":"Use 
Cases","collection":"use-cases","path":"/use-cases/developer-internal-tools/","date":"2026-04-10T00:00:00.000Z","tags":["platform","devtools"],"entities":[],"audience":"operator","technical_depth":"intermediate","agent_type":"general","stack_layer":"tool-use","maturity":"production"},{"title":"Legal: contract redlining assistant","description":"A focused agent flags deviations from a playbook and proposes redlines for a human to approve.","section":"Use Cases","collection":"use-cases","path":"/use-cases/legal-redlining/","date":"2026-04-10T00:00:00.000Z","tags":["legal","contracts"],"entities":[],"audience":"operator","technical_depth":"intermediate","agent_type":"general","stack_layer":"distribution","maturity":"production"},{"title":"Sales: pre-call research and account briefs","description":"A research agent assembles a 1-page brief 30 minutes before every external call.","section":"Use Cases","collection":"use-cases","path":"/use-cases/sales-research-prep/","date":"2026-04-10T00:00:00.000Z","tags":["sales","research","b2b"],"entities":[],"audience":"operator","technical_depth":"intermediate","agent_type":"research","stack_layer":"distribution","maturity":"production"},{"title":"Security: alert triage and enrichment","description":"An agent enriches and triages SOC alerts, halving the load on tier-1 analysts.","section":"Use Cases","collection":"use-cases","path":"/use-cases/security-triage/","date":"2026-04-10T00:00:00.000Z","tags":["security","soc","triage"],"entities":[],"audience":"operator","technical_depth":"intermediate","agent_type":"workflow","stack_layer":"distribution","maturity":"beta"},{"title":"EU AI Office issues draft guidance on autonomous agent disclosures","description":"The draft requires clear disclosure when agents act on a user's behalf in regulated transactions, plus an audit log requirement for high-risk 
deployments.","section":"News","collection":"news","path":"/news/eu-ai-act-agent-guidance/","date":"2026-04-09T14:25:00.000Z","tags":["regulation","eu-ai-act","governance","compliance"],"entities":["European Union","EU AI Office"],"audience":"executive","technical_depth":"intro","agent_type":"general","stack_layer":"safety","maturity":"production"},{"title":"Six failure modes in tool-using agents, and the patterns that fix them","description":"An empirical taxonomy of agent tool-use failures across 4,000 traces from production deployments. Schema drift and silent partial-failure dominate.","section":"Research","collection":"research","path":"/research/tool-use-failure-modes/","date":"2026-04-08T13:15:00.000Z","tags":["tool-use","failure-modes","production"],"entities":["DeepMind"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"tool-use","maturity":"production"},{"title":"AutoGen vs. LangGraph","description":"Conversation-shaped runtimes vs. graph-shaped runtimes.","section":"Comparisons","collection":"comparisons","path":"/comparisons/autogen-vs-langgraph/","date":"2026-04-08T00:00:00.000Z","tags":["comparison"],"entities":["AutoGen","LangGraph"],"audience":"builder","technical_depth":"intermediate"},{"title":"Browserbase vs. self-hosted Playwright","description":"Hosted browser infra vs. rolling your own. The total-cost picture.","section":"Comparisons","collection":"comparisons","path":"/comparisons/browserbase-vs-self-host-playwright/","date":"2026-04-08T00:00:00.000Z","tags":["comparison"],"entities":["Browserbase","Playwright self-hosted"],"audience":"builder","technical_depth":"intermediate"},{"title":"Langfuse vs. 
Arize Phoenix","description":"Two of the strongest open-source observability options compared.","section":"Comparisons","collection":"comparisons","path":"/comparisons/langfuse-vs-arize-phoenix/","date":"2026-04-08T00:00:00.000Z","tags":["comparison"],"entities":["Langfuse","Phoenix"],"audience":"builder","technical_depth":"intermediate"},{"title":"LangGraph vs. CrewAI","description":"Graph-based vs. role-based multi-agent orchestration. When to pick which.","section":"Comparisons","collection":"comparisons","path":"/comparisons/langgraph-vs-crewai/","date":"2026-04-08T00:00:00.000Z","tags":["comparison"],"entities":["LangGraph","CrewAI"],"audience":"builder","technical_depth":"intermediate"},{"title":"MCP vs. vendor tool formats","description":"When the protocol pays for itself, and when it does not.","section":"Comparisons","collection":"comparisons","path":"/comparisons/mcp-vs-vendor-tool-formats/","date":"2026-04-08T00:00:00.000Z","tags":["comparison"],"entities":["MCP","OpenAI tool calls","Anthropic tools"],"audience":"builder","technical_depth":"intermediate"},{"title":"Decoupled planner-critic agents outperform monolithic planners on long tasks","description":"Splitting planning and critique into specialized models with structured exchange yields a 14-point lift on multi-day research tasks.","section":"Research","collection":"research","path":"/research/planner-critic-decoupling/","date":"2026-04-04T10:00:00.000Z","tags":["planning","critic","architecture"],"entities":["MIT CSAIL"],"audience":"researcher","technical_depth":"deep","agent_type":"planner","stack_layer":"orchestration","maturity":"experiment"},{"title":"Agent","description":"A system that decides which actions to take by combining a model with tools and memory.","section":"Glossary","collection":"glossary","path":"/glossary/agent/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"Anthropic","description":"Frontier model 
lab; ships Claude and Computer Use.","section":"Companies","collection":"companies","path":"/companies/anthropic/","date":"2026-04-01T00:00:00.000Z","tags":["company"],"entities":["Anthropic"],"audience":"general","technical_depth":"intro"},{"title":"Browserbase","description":"Hosted browser infrastructure for agent automation.","section":"Companies","collection":"companies","path":"/companies/browserbase-co/","date":"2026-04-01T00:00:00.000Z","tags":["company"],"entities":["Browserbase"],"audience":"general","technical_depth":"intro"},{"title":"Claude Sonnet 4.5","description":"Anthropic's mid-tier frontier model with strong tool-use and Computer Use support.","section":"Models","collection":"models","path":"/models/claude-sonnet-4-5/","date":"2026-04-01T00:00:00.000Z","tags":["model"],"entities":["Anthropic"],"audience":"builder","technical_depth":"intermediate"},{"title":"GAIA","description":"General AI assistants benchmark.","section":"Benchmarks","collection":"benchmarks","path":"/benchmarks/gaia/","date":"2026-04-01T00:00:00.000Z","tags":["benchmark"],"entities":["GAIA"],"audience":"researcher","technical_depth":"intermediate"},{"title":"Gemini 2.5 Pro","description":"Google's flagship long-context model.","section":"Models","collection":"models","path":"/models/gemini-2-5-pro/","date":"2026-04-01T00:00:00.000Z","tags":["model"],"entities":["Google"],"audience":"builder","technical_depth":"intermediate"},{"title":"Google DeepMind","description":"Gemini frontier models and Vertex AI agent infrastructure.","section":"Companies","collection":"companies","path":"/companies/google/","date":"2026-04-01T00:00:00.000Z","tags":["company"],"entities":["Google DeepMind"],"audience":"general","technical_depth":"intro"},{"title":"GPT-5","description":"OpenAI's top-tier reasoning 
model.","section":"Models","collection":"models","path":"/models/gpt-5/","date":"2026-04-01T00:00:00.000Z","tags":["model"],"entities":["OpenAI"],"audience":"builder","technical_depth":"intermediate"},{"title":"Handoff","description":"The transfer of control or state from one agent to another, or from an agent to a human.","section":"Glossary","collection":"glossary","path":"/glossary/handoff/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"LangChain","description":"Maintainers of LangChain, LangGraph, and LangSmith.","section":"Companies","collection":"companies","path":"/companies/langchain/","date":"2026-04-01T00:00:00.000Z","tags":["company"],"entities":["LangChain"],"audience":"general","technical_depth":"intro"},{"title":"Llama 4 405B","description":"Meta's open-weights frontier model.","section":"Models","collection":"models","path":"/models/llama-4-405b/","date":"2026-04-01T00:00:00.000Z","tags":["model"],"entities":["Meta"],"audience":"builder","technical_depth":"intermediate"},{"title":"Long-horizon task","description":"A task spanning many steps over hours or days, requiring durable state and memory.","section":"Glossary","collection":"glossary","path":"/glossary/long-horizon/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"Mistral AI","description":"European frontier lab with strong open-weights releases.","section":"Companies","collection":"companies","path":"/companies/mistral/","date":"2026-04-01T00:00:00.000Z","tags":["company"],"entities":["Mistral AI"],"audience":"general","technical_depth":"intro"},{"title":"Mistral Large 3","description":"European-residency frontier model with strong 
tool-calling.","section":"Models","collection":"models","path":"/models/mistral-large-3/","date":"2026-04-01T00:00:00.000Z","tags":["model"],"entities":["Mistral"],"audience":"builder","technical_depth":"intermediate"},{"title":"Model Context Protocol (MCP)","description":"An open protocol for exposing tools and context to LLMs through a standard interface.","section":"Glossary","collection":"glossary","path":"/glossary/mcp/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"Multi-agent system","description":"A system of two or more agents that exchange messages or hand off tasks.","section":"Glossary","collection":"glossary","path":"/glossary/multi-agent/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"OpenAI","description":"Ships GPT-class models, Swarm orchestration, and the most-deployed agent runtimes.","section":"Companies","collection":"companies","path":"/companies/openai/","date":"2026-04-01T00:00:00.000Z","tags":["company"],"entities":["OpenAI"],"audience":"general","technical_depth":"intro"},{"title":"OSWorld","description":"Computer-use benchmark spanning OS, browser, and productivity apps.","section":"Benchmarks","collection":"benchmarks","path":"/benchmarks/osworld/","date":"2026-04-01T00:00:00.000Z","tags":["benchmark"],"entities":["OSWorld"],"audience":"researcher","technical_depth":"intermediate"},{"title":"Planner–critic architecture","description":"A pattern where a planner proposes steps and a critic prunes or revises them.","section":"Glossary","collection":"glossary","path":"/glossary/planner-critic/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"Qwen 3 72B","description":"Alibaba's open-weights model with strong multilingual 
performance.","section":"Models","collection":"models","path":"/models/qwen-3-72b/","date":"2026-04-01T00:00:00.000Z","tags":["model"],"entities":["Alibaba"],"audience":"builder","technical_depth":"intermediate"},{"title":"Replay-based evaluation","description":"Scoring agent candidates against captured real-world sessions with held-out outcomes.","section":"Glossary","collection":"glossary","path":"/glossary/replay-eval/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"Retrieval-augmented generation (RAG)","description":"Retrieving documents at inference time and conditioning generation on them.","section":"Glossary","collection":"glossary","path":"/glossary/rag/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"Sandbox","description":"An isolated execution environment for running agent code or browser actions safely.","section":"Glossary","collection":"glossary","path":"/glossary/sandbox/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"SWE-bench Verified","description":"Verified subset of SWE-bench, the canonical coding-agent benchmark.","section":"Benchmarks","collection":"benchmarks","path":"/benchmarks/swe-bench-verified/","date":"2026-04-01T00:00:00.000Z","tags":["benchmark"],"entities":["SWE-bench Verified"],"audience":"researcher","technical_depth":"intermediate"},{"title":"Tool use","description":"The pattern of an LLM invoking external functions to gather data or take action.","section":"Glossary","collection":"glossary","path":"/glossary/tool-use/","date":"2026-04-01T00:00:00.000Z","tags":["glossary"],"entities":[],"audience":"general","technical_depth":"intro"},{"title":"WebArena","description":"Web-navigation benchmark for browser 
agents.","section":"Benchmarks","collection":"benchmarks","path":"/benchmarks/webarena/","date":"2026-04-01T00:00:00.000Z","tags":["benchmark"],"entities":["WebArena"],"audience":"researcher","technical_depth":"intermediate"},{"title":"τ-bench","description":"Tool-use evaluation across realistic transactional workflows.","section":"Benchmarks","collection":"benchmarks","path":"/benchmarks/tau-bench/","date":"2026-04-01T00:00:00.000Z","tags":["benchmark"],"entities":["τ-bench"],"audience":"researcher","technical_depth":"intermediate"},{"title":"The case for replay-based agent evaluation","description":"Static benchmarks miss the failure modes that matter in production. This paper argues for replay sets — captured user sessions scored against a held-out outcome.","section":"Research","collection":"research","path":"/research/agent-eval-replay-sets/","date":"2026-03-30T08:45:00.000Z","tags":["evaluation","replay","production"],"entities":["Berkeley","AnyScale"],"audience":"builder","technical_depth":"intermediate","agent_type":"general","stack_layer":"eval","maturity":"production"}]