Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion gateway/src/main_service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ use crate::{
NodeData, NodeStatus, PortPolicy, WaveKvSyncService,
},
models::{InstanceInfo, PortPolicyView, WgConf},
proxy::{create_acceptor_with_cert_resolver, AddressGroup, AddressInfo},
proxy::{create_acceptor_with_cert_resolver, AddressGroup, AddressInfo, AppAddressResolver},
};

mod auth_client;
Expand Down Expand Up @@ -89,6 +89,9 @@ pub struct ProxyInner {
/// allow traffic.
pub(crate) port_policy_tx: UnboundedSender<String>,
handshake_cache: Arc<LatestHandshakesCache>,
/// Shared DNS resolver for SNI TXT lookups. Reusing one resolver lets the
/// hickory DNS cache work across proxy connections.
pub(crate) app_address_resolver: Arc<AppAddressResolver>,
}

const HANDSHAKE_CACHE_TTL: Duration = Duration::from_secs(30);
Expand Down Expand Up @@ -290,6 +293,10 @@ impl ProxyInner {
let h2_acceptor =
create_acceptor_with_cert_resolver(&config.proxy, cert_resolver.clone(), true)
.context("failed to create h2 acceptor with cert resolver")?;
let app_address_resolver = Arc::new(AppAddressResolver::new(
config.proxy.app_address_ns_prefix.clone(),
config.proxy.app_address_ns_compat,
));

Ok(Self {
config,
Expand All @@ -306,6 +313,7 @@ impl ProxyInner {
https_config: Some(https_config),
port_policy_tx,
handshake_cache,
app_address_resolver,
})
}

Expand Down
1 change: 1 addition & 0 deletions gateway/src/proxy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ use std::{
use anyhow::{bail, Context, Result};
use or_panic::ResultOrPanic;
use sni::extract_sni;
pub(crate) use tls_passthough::AppAddressResolver;
pub(crate) use tls_terminate::create_acceptor_with_cert_resolver;
use tokio::{
io::AsyncReadExt,
Expand Down
134 changes: 92 additions & 42 deletions gateway/src/proxy/tls_passthough.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@

use std::fmt::Debug;
use std::sync::atomic::Ordering;
use std::time::Duration;

use anyhow::{bail, Context, Result};
use hickory_resolver::{lookup::TxtLookup, TokioAsyncResolver};
use proxy_protocol::ProxyHeader;
use tokio::sync::OnceCell;
use tokio::{io::AsyncWriteExt, net::TcpStream, task::JoinSet, time::timeout};
use tracing::{debug, info, warn};

Expand All @@ -21,6 +24,9 @@ use super::{
AddressGroup,
};

const APP_ADDRESS_DNS_CACHE_SIZE: usize = 256;
const APP_ADDRESS_NEGATIVE_CACHE_TTL: Duration = Duration::from_secs(10);

#[derive(Debug)]
struct AppAddress {
app_id: String,
Expand All @@ -39,38 +45,95 @@ impl AppAddress {
}
}

/// resolve app address by sni
async fn resolve_app_address(prefix: &str, sni: &str, compat: bool) -> Result<AppAddress> {
/// Shared resolver for SNI -> app address TXT lookups.
///
/// Hickory's resolver already has an internal TTL-aware DNS cache. The old
/// code created a new resolver per proxy connection, which defeated that cache.
/// Keeping a resolver in `ProxyInner` makes TXT caching effective across
/// connections without introducing a separate cache invalidation policy here.
pub(crate) struct AppAddressResolver {
prefix: String,
compat: bool,
resolver: OnceCell<TokioAsyncResolver>,
}

impl AppAddressResolver {
pub(crate) fn new(prefix: String, compat: bool) -> Self {
Self {
prefix,
compat,
resolver: OnceCell::new(),
}
}

async fn resolver(&self) -> Result<&TokioAsyncResolver> {
self.resolver
.get_or_try_init(|| async { app_address_tokio_resolver_from_system_conf() })
.await
}

async fn resolve(&self, sni: &str) -> Result<AppAddress> {
resolve_app_address(self.resolver().await?, &self.prefix, sni, self.compat).await
}
}

fn app_address_tokio_resolver_from_system_conf() -> Result<TokioAsyncResolver> {
let (config, mut options) = hickory_resolver::system_conf::read_system_conf()
.context("failed to read system dns config")?;

// App-address records may appear shortly after a CVM/app is registered.
// Reusing one resolver enables positive TXT caching, but we do not want a
// transient NXDOMAIN/NODATA response to hide a newly-added app for too
// long. Keep positive caching TTL-aware and cap negative caching.
options.cache_size = APP_ADDRESS_DNS_CACHE_SIZE;
options.negative_min_ttl = Some(Duration::ZERO);
options.negative_max_ttl = Some(APP_ADDRESS_NEGATIVE_CACHE_TTL);

Ok(TokioAsyncResolver::tokio(config, options))
}

fn parse_lookup(lookup: &TxtLookup, sni: &str, txt_domain: &str) -> Result<Option<AppAddress>> {
let Some(txt_record) = lookup.iter().next() else {
return Ok(None);
};
let Some(data) = txt_record.txt_data().first() else {
return Ok(None);
};
AppAddress::parse(data)
.with_context(|| format!("failed to parse app address for {sni} via {txt_domain}"))
.map(Some)
}

/// Resolve app address by SNI. `resolver` is shared so its DNS cache is reused.
async fn resolve_app_address(
resolver: &TokioAsyncResolver,
prefix: &str,
sni: &str,
compat: bool,
) -> Result<AppAddress> {
let txt_domain = format!("{prefix}.{sni}");
let resolver = hickory_resolver::AsyncResolver::tokio_from_system_conf()
.context("failed to create dns resolver")?;

if compat && prefix != "_tapp-address" {
let txt_domain_legacy = format!("_tapp-address.{sni}");
let (lookup, lookup_legacy) = tokio::join!(
resolver.txt_lookup(txt_domain),
resolver.txt_lookup(txt_domain_legacy),
resolver.txt_lookup(&txt_domain),
resolver.txt_lookup(&txt_domain_legacy),
);
for lookup in [lookup, lookup_legacy] {
for (lookup, domain) in [
(lookup, txt_domain.as_str()),
(lookup_legacy, txt_domain_legacy.as_str()),
] {
let Ok(lookup) = lookup else {
continue;
};
let Some(txt_record) = lookup.iter().next() else {
continue;
};
let Some(data) = txt_record.txt_data().first() else {
continue;
};
return AppAddress::parse(data)
.with_context(|| format!("failed to parse app address for {sni}"));
}
} else if let Ok(lookup) = resolver.txt_lookup(txt_domain).await {
if let Some(txt_record) = lookup.iter().next() {
if let Some(data) = txt_record.txt_data().first() {
return AppAddress::parse(data)
.with_context(|| format!("failed to parse app address for {sni}"));
if let Some(app_address) = parse_lookup(&lookup, sni, domain)? {
return Ok(app_address);
}
}
} else if let Ok(lookup) = resolver.txt_lookup(&txt_domain).await {
if let Some(app_address) = parse_lookup(&lookup, sni, &txt_domain)? {
return Ok(app_address);
}
}

// wildcard fallback: try {prefix}-wildcard.{parent_domain}
Expand All @@ -82,17 +145,8 @@ async fn resolve_app_address(prefix: &str, sni: &str, compat: bool) -> Result<Ap
.with_context(|| {
format!("failed to lookup wildcard app address for {sni} via {wildcard_domain}")
})?;
let txt_record = lookup
.iter()
.next()
.with_context(|| format!("no txt record found for {sni} via {wildcard_domain}"))?;
let data = txt_record
.txt_data()
.first()
.with_context(|| format!("no data in txt record for {sni} via {wildcard_domain}"))?;
return AppAddress::parse(data).with_context(|| {
format!("failed to parse app address for {sni} via {wildcard_domain}")
});
return parse_lookup(&lookup, sni, &wildcard_domain)?
.with_context(|| format!("no txt record found for {sni} via {wildcard_domain}"));
}

anyhow::bail!("failed to resolve app address for {sni}");
Expand All @@ -105,10 +159,8 @@ pub(crate) async fn proxy_with_sni(
buffer: Vec<u8>,
sni: &str,
) -> Result<()> {
let ns_prefix = &state.config.proxy.app_address_ns_prefix;
let compat = state.config.proxy.app_address_ns_compat;
let dns_timeout = state.config.proxy.timeouts.dns_resolve;
let addr = timeout(dns_timeout, resolve_app_address(ns_prefix, sni, compat))
let addr = timeout(dns_timeout, state.app_address_resolver.resolve(sni))
.await
.with_context(|| format!("DNS TXT resolve timeout for {sni}"))?
.with_context(|| format!("failed to resolve app address for {sni}"))?;
Expand Down Expand Up @@ -221,13 +273,11 @@ mod tests {

#[tokio::test]
async fn test_resolve_app_address() {
let app_addr = resolve_app_address(
"_dstack-app-address",
"3327603e03f5bd1f830812ca4a789277fc31f577.app.dstack.org",
false,
)
.await
.unwrap();
let resolver = AppAddressResolver::new("_dstack-app-address".to_string(), false);
let app_addr = resolver
.resolve("3327603e03f5bd1f830812ca4a789277fc31f577.app.dstack.org")
.await
.unwrap();
assert_eq!(app_addr.app_id, "3327603e03f5bd1f830812ca4a789277fc31f577");
assert_eq!(app_addr.port, 8090);
}
Expand Down
4 changes: 4 additions & 0 deletions gateway/test-run/TESTING.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# SPDX-FileCopyrightText: © 2026 Phala Network <dstack@phala.network>
#
# SPDX-License-Identifier: Apache-2.0

# Gateway test plan

This document records the local checks used for the gateway handshake-cache
Expand Down
5 changes: 5 additions & 0 deletions gateway/test-run/test_suite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
# SPDX-License-Identifier: Apache-2.0

# WaveKV integration test script
#
# This legacy integration script predates the repository-wide shellcheck hook.
# Keep the existing style warnings suppressed so small harness fixes do not
# require a full script rewrite.
# shellcheck disable=SC2015,SC2034,SC2086,SC2155,SC2164

# Don't use set -e as it causes issues with cleanup and test flow
# set -e
Expand Down
Loading