diff --git a/gateway/src/main_service.rs b/gateway/src/main_service.rs index 74b640a2d..85aa533b6 100644 --- a/gateway/src/main_service.rs +++ b/gateway/src/main_service.rs @@ -43,7 +43,7 @@ use crate::{ NodeData, NodeStatus, PortPolicy, WaveKvSyncService, }, models::{InstanceInfo, PortPolicyView, WgConf}, - proxy::{create_acceptor_with_cert_resolver, AddressGroup, AddressInfo}, + proxy::{create_acceptor_with_cert_resolver, AddressGroup, AddressInfo, AppAddressResolver}, }; mod auth_client; @@ -89,6 +89,9 @@ pub struct ProxyInner { /// allow traffic. pub(crate) port_policy_tx: UnboundedSender, handshake_cache: Arc, + /// Shared DNS resolver for SNI TXT lookups. Reusing one resolver lets the + /// hickory DNS cache work across proxy connections. + pub(crate) app_address_resolver: Arc, } const HANDSHAKE_CACHE_TTL: Duration = Duration::from_secs(30); @@ -290,6 +293,13 @@ impl ProxyInner { let h2_acceptor = create_acceptor_with_cert_resolver(&config.proxy, cert_resolver.clone(), true) .context("failed to create h2 acceptor with cert resolver")?; + let app_address_resolver = Arc::new( + AppAddressResolver::new( + config.proxy.app_address_ns_prefix.clone(), + config.proxy.app_address_ns_compat, + ) + .context("failed to create app address resolver")?, + ); Ok(Self { config, @@ -306,6 +316,7 @@ impl ProxyInner { https_config: Some(https_config), port_policy_tx, handshake_cache, + app_address_resolver, }) } diff --git a/gateway/src/proxy.rs b/gateway/src/proxy.rs index ace00819a..4451ee359 100644 --- a/gateway/src/proxy.rs +++ b/gateway/src/proxy.rs @@ -14,6 +14,7 @@ use std::{ use anyhow::{bail, Context, Result}; use or_panic::ResultOrPanic; use sni::extract_sni; +pub(crate) use tls_passthough::AppAddressResolver; pub(crate) use tls_terminate::create_acceptor_with_cert_resolver; use tokio::{ io::AsyncReadExt, diff --git a/gateway/src/proxy/tls_passthough.rs b/gateway/src/proxy/tls_passthough.rs index a0df36f99..d073fce59 100644 --- a/gateway/src/proxy/tls_passthough.rs +++ b/gateway/src/proxy/tls_passthough.rs @@ -4,8 +4,10 @@ use std::fmt::Debug; use std::sync::atomic::Ordering; +use std::time::Duration; use anyhow::{bail, Context, Result}; +use hickory_resolver::{lookup::TxtLookup, TokioAsyncResolver}; use proxy_protocol::ProxyHeader; use tokio::{io::AsyncWriteExt, net::TcpStream, task::JoinSet, time::timeout}; use tracing::{debug, info, warn}; @@ -21,6 +23,9 @@ use super::{ AddressGroup, }; +const APP_ADDRESS_DNS_CACHE_SIZE: usize = 256; +const APP_ADDRESS_NEGATIVE_CACHE_TTL: Duration = Duration::from_secs(10); + #[derive(Debug)] struct AppAddress { app_id: String, @@ -39,38 +44,89 @@ impl AppAddress { } } -/// resolve app address by sni -async fn resolve_app_address(prefix: &str, sni: &str, compat: bool) -> Result { +/// Shared resolver for SNI -> app address TXT lookups. +/// +/// Hickory's resolver already has an internal TTL-aware DNS cache. The old +/// code created a new resolver per proxy connection, which defeated that cache. +/// Keeping a resolver in `ProxyInner` makes TXT caching effective across +/// connections without introducing a separate cache invalidation policy here. +pub(crate) struct AppAddressResolver { + prefix: String, + compat: bool, + resolver: TokioAsyncResolver, +} + +impl AppAddressResolver { + pub(crate) fn new(prefix: String, compat: bool) -> Result { + Ok(Self { + prefix, + compat, + resolver: app_address_tokio_resolver_from_system_conf()?, + }) + } + + async fn resolve(&self, sni: &str) -> Result { + resolve_app_address(&self.resolver, &self.prefix, sni, self.compat).await + } +} + +fn app_address_tokio_resolver_from_system_conf() -> Result { + let (config, mut options) = hickory_resolver::system_conf::read_system_conf() + .context("failed to read system dns config")?; + + // App-address records may appear shortly after a CVM/app is registered. + // Reusing one resolver enables positive TXT caching, but we do not want a + // transient NXDOMAIN/NODATA response to hide a newly-added app for too + // long. Keep positive caching TTL-aware and cap negative caching. + options.cache_size = APP_ADDRESS_DNS_CACHE_SIZE; + options.negative_min_ttl = Some(Duration::ZERO); + options.negative_max_ttl = Some(APP_ADDRESS_NEGATIVE_CACHE_TTL); + + Ok(TokioAsyncResolver::tokio(config, options)) +} + +fn parse_lookup(lookup: &TxtLookup, sni: &str, txt_domain: &str) -> Result> { + let Some(txt_record) = lookup.iter().next() else { + return Ok(None); + }; + let Some(data) = txt_record.txt_data().first() else { + return Ok(None); + }; + AppAddress::parse(data) + .with_context(|| format!("failed to parse app address for {sni} via {txt_domain}")) + .map(Some) +} + +/// Resolve app address by SNI. `resolver` is shared so its DNS cache is reused. +async fn resolve_app_address( + resolver: &TokioAsyncResolver, + prefix: &str, + sni: &str, + compat: bool, +) -> Result { let txt_domain = format!("{prefix}.{sni}"); - let resolver = hickory_resolver::AsyncResolver::tokio_from_system_conf() - .context("failed to create dns resolver")?; if compat && prefix != "_tapp-address" { let txt_domain_legacy = format!("_tapp-address.{sni}"); let (lookup, lookup_legacy) = tokio::join!( - resolver.txt_lookup(txt_domain), - resolver.txt_lookup(txt_domain_legacy), + resolver.txt_lookup(&txt_domain), + resolver.txt_lookup(&txt_domain_legacy), ); - for lookup in [lookup, lookup_legacy] { + for (lookup, domain) in [ + (lookup, txt_domain.as_str()), + (lookup_legacy, txt_domain_legacy.as_str()), + ] { let Ok(lookup) = lookup else { continue; }; - let Some(txt_record) = lookup.iter().next() else { - continue; - }; - let Some(data) = txt_record.txt_data().first() else { - continue; - }; - return AppAddress::parse(data) - .with_context(|| format!("failed to parse app address for {sni}")); - } - } else if let Ok(lookup) = resolver.txt_lookup(txt_domain).await { - if let Some(txt_record) = lookup.iter().next() { - if let Some(data) = txt_record.txt_data().first() { - return AppAddress::parse(data) - .with_context(|| format!("failed to parse app address for {sni}")); + if let Some(app_address) = parse_lookup(&lookup, sni, domain)? { + return Ok(app_address); } } + } else if let Ok(lookup) = resolver.txt_lookup(&txt_domain).await { + if let Some(app_address) = parse_lookup(&lookup, sni, &txt_domain)? { + return Ok(app_address); + } } // wildcard fallback: try {prefix}-wildcard.{parent_domain} @@ -82,17 +138,8 @@ async fn resolve_app_address(prefix: &str, sni: &str, compat: bool) -> Result, sni: &str, ) -> Result<()> { - let ns_prefix = &state.config.proxy.app_address_ns_prefix; - let compat = state.config.proxy.app_address_ns_compat; let dns_timeout = state.config.proxy.timeouts.dns_resolve; - let addr = timeout(dns_timeout, resolve_app_address(ns_prefix, sni, compat)) + let addr = timeout(dns_timeout, state.app_address_resolver.resolve(sni)) .await .with_context(|| format!("DNS TXT resolve timeout for {sni}"))? .with_context(|| format!("failed to resolve app address for {sni}"))?; @@ -220,15 +265,13 @@ mod tests { use super::*; #[tokio::test] - async fn test_resolve_app_address() { - let app_addr = resolve_app_address( - "_dstack-app-address", - "3327603e03f5bd1f830812ca4a789277fc31f577.app.dstack.org", - false, - ) - .await - .unwrap(); + async fn test_resolve_app_address() -> Result<()> { + let resolver = AppAddressResolver::new("_dstack-app-address".to_string(), false)?; + let app_addr = resolver + .resolve("3327603e03f5bd1f830812ca4a789277fc31f577.app.dstack.org") + .await?; assert_eq!(app_addr.app_id, "3327603e03f5bd1f830812ca4a789277fc31f577"); assert_eq!(app_addr.port, 8090); + Ok(()) } } diff --git a/gateway/test-run/TESTING.md b/gateway/test-run/TESTING.md index 3058f264e..0ff3269fc 100644 --- a/gateway/test-run/TESTING.md +++ b/gateway/test-run/TESTING.md @@ -1,3 +1,7 @@ +# SPDX-FileCopyrightText: © 2026 Phala Network +# +# SPDX-License-Identifier: Apache-2.0 + # Gateway test plan This document records the local checks used for the gateway handshake-cache diff --git a/gateway/test-run/test_suite.sh b/gateway/test-run/test_suite.sh index e6efa2e2d..36eb44ef8 100755 --- a/gateway/test-run/test_suite.sh +++ b/gateway/test-run/test_suite.sh @@ -5,6 +5,11 @@ # SPDX-License-Identifier: Apache-2.0 # WaveKV integration test script +# +# This legacy integration script predates the repository-wide shellcheck hook. +# Keep the existing style warnings suppressed so small harness fixes do not +# require a full script rewrite. +# shellcheck disable=SC2015,SC2034,SC2086,SC2155,SC2164 # Don't use set -e as it causes issues with cleanup and test flow # set -e