diff --git a/README.md b/README.md index 0d54c05c1..6b0e5ef0f 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ API and command-line option may change frequently.*** - [ERNIE-Image](./docs/ernie_image.md) - [Boogu Image](./docs/boogu_image.md) - [Krea2](./docs/krea2.md) + - [SeFi-Image](./docs/sefi_image.md) - [HiDream-O1-Image](./docs/hidream_o1_image.md) - [Ideogram4](./docs/ideogram4.md) - Image Edit Models diff --git a/assets/sefi_image/example.png b/assets/sefi_image/example.png new file mode 100644 index 000000000..14c53b815 Binary files /dev/null and b/assets/sefi_image/example.png differ diff --git a/docs/sefi_image.md b/docs/sefi_image.md new file mode 100644 index 000000000..e0e12fa81 --- /dev/null +++ b/docs/sefi_image.md @@ -0,0 +1,50 @@ +# How to Use + +SeFi-Image uses a Flux2-style dual-time transformer (semantic + texture streams), the standard Flux2 VAE, and Qwen3-VL as the LLM text encoder. Tech report: [arXiv:2606.22568](https://arxiv.org/abs/2606.22568). + +## Download weights + +The SeFi-Image family ships in three scales (1B / 2B / 5B) and three families (Base / RL / turbo), all gated on Hugging Face under https://huggingface.co/SeFi-Image. + +- 1B and 2B variants pair with Qwen3-VL-2B-Instruct. +- 5B variants pair with Qwen3-VL-4B-Instruct. +- All variants use the standard Flux2 VAE (`flux2_ae.safetensors` from https://huggingface.co/black-forest-labs/FLUX.2-dev). + +Convert the transformer and text encoder to sd.cpp safetensors: + +```bash +python3 script/convert_sefi.py <diffusers_dir> <output_dir>/sefi_<scale>_<family>.safetensors +python3 script/convert_qwen3_vl.py <hf_dir>/Qwen3-VL-XB-Instruct <output_dir>/qwen3_vl_<X>b.safetensors +``` + +## Variant defaults + +| Family | timestep_shift_alpha | steps | cfg-scale | +|---|---|---|---| +| Base | 0.3 | 50 | 4.0 | +| RL | 0.3 | 50 | 4.0 | +| turbo | 1.0 | 4 | 1.0 | + +The dispatcher picks `alpha` from the filename (`turbo` substring => 1.0, otherwise 0.3). Override via `--extra-sample-args sefi_alpha=<float>` or `sefi_delta_t=<float>`.
+ +## Examples + +### 1B / 2B turbo + +``` +./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png +``` + +### 1B / 2B base + +``` +./build/bin/sd-cli --diffusion-model /path/to/sefi_1b_base.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_2b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 4.0 --steps 50 -W 1024 -H 1024 -s 42 --diffusion-fa --offload-to-cpu -o out.png +``` + +### 5B (needs streaming on 12 GiB VRAM) + +``` +./build/bin/sd-cli --diffusion-model /path/to/sefi_5b_turbo.safetensors --vae /path/to/flux2_ae.safetensors --llm /path/to/qwen3_vl_4b.safetensors -p "a photograph of an orange tabby cat sitting on a couch" --cfg-scale 1.0 --steps 4 -W 1024 -H 1024 -s 42 --diffusion-fa --max-vram 8 --stream-layers --offload-to-cpu -o out.png +``` + +SeFi-Image 5B turbo example diff --git a/include/stable-diffusion.h b/include/stable-diffusion.h index 7e21d6624..51494d508 100644 --- a/include/stable-diffusion.h +++ b/include/stable-diffusion.h @@ -81,6 +81,7 @@ enum prediction_t { FLOW_PRED, FLUX_FLOW_PRED, FLUX2_FLOW_PRED, + SEFI_FLOW_PRED, PREDICTION_COUNT }; diff --git a/script/convert_qwen3_vl.py b/script/convert_qwen3_vl.py new file mode 100644 index 000000000..d34aeb5ff --- /dev/null +++ b/script/convert_qwen3_vl.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +"""Convert a Qwen3-VL HF safetensors checkpoint into a sd.cpp-loadable form. + +The HF dump prefixes text-tower keys with ``model.language_model.`` and +vision-tower keys with ``model.visual.``. sd.cpp expects ``model.`` for +the text side; the vision side is converted by sd.cpp's own +``convert_qwen3_vl_vision_name`` and is left as-is here. 
def rewrite_key(key: str) -> str:
    """Map a Hugging Face Qwen3-VL tensor name to the name sd.cpp loads.

    Keys rooted at ``model.language_model.`` are re-rooted at ``model.``.
    Every other key is returned unchanged.
    """
    hf_text_prefix = "model.language_model."
    if not key.startswith(hf_text_prefix):
        return key
    suffix = key[len(hf_text_prefix):]
    return "model." + suffix
def rewrite_transformer_key(key: str) -> str:
    """Translate one diffusers SeFi-Image transformer key to its sd.cpp name.

    Every result is rooted at ``model.diffusion_model.``. Steps, in order:

    1. A leading ``backbone.`` is dropped; a key that instead starts with
       ``dual_time_embed.`` is returned as-is under the root.
    2. For keys under one of ``_SHARED_MOD_PREFIXES``, the first ``.linear.``
       becomes ``.lin.``.
    3. A fixed set of embedder / output-projection names is renamed.
    4. ``transformer_blocks.N.*`` and ``single_transformer_blocks.N.*`` are
       delegated to the per-block rewriters.
    5. Anything else is passed through under the root.
    """
    root = "model.diffusion_model."

    if key.startswith("backbone."):
        key = key[len("backbone."):]
    elif key.startswith("dual_time_embed."):
        return root + key

    # At most one substitution, applied only when a shared-modulation prefix matches.
    for shared_prefix in _SHARED_MOD_PREFIXES:
        if key.startswith(shared_prefix + "."):
            key = _LINEAR_TO_LIN.sub(".lin.", key, count=1)
            break

    direct_renames = {
        "context_embedder.weight": "txt_in.weight",
        "context_embedder.bias": "txt_in.bias",
        "x_embedder.weight": "img_in.weight",
        "x_embedder.bias": "img_in.bias",
        "proj_out.weight": "final_layer.linear.weight",
        "proj_out.bias": "final_layer.linear.bias",
        "norm_out.linear.weight": "final_layer.adaLN_modulation.1.weight",
        "norm_out.linear.bias": "final_layer.adaLN_modulation.1.bias",
    }
    renamed = direct_renames.get(key)
    if renamed is not None:
        return root + renamed

    double_match = re.match(r"transformer_blocks\.(\d+)\.(.*)$", key)
    if double_match:
        block_idx, tail = double_match.group(1), double_match.group(2)
        return root + _rewrite_double_stream(block_idx, tail)

    single_match = re.match(r"single_transformer_blocks\.(\d+)\.(.*)$", key)
    if single_match:
        block_idx, tail = single_match.group(1), single_match.group(2)
        return root + _rewrite_single_stream(block_idx, tail)

    return root + key
+ mapping = { + "norm1.linear.weight": "img_mod.lin.weight", + "norm1_context.linear.weight": "txt_mod.lin.weight", + "attn.norm_q.weight": "img_attn.norm.query_norm.scale", + "attn.norm_k.weight": "img_attn.norm.key_norm.scale", + "attn.norm_added_q.weight": "txt_attn.norm.query_norm.scale", + "attn.norm_added_k.weight": "txt_attn.norm.key_norm.scale", + "attn.to_out.0.weight": "img_attn.proj.weight", + "attn.to_add_out.weight": "txt_attn.proj.weight", + "ff.net.0.proj.weight": "img_mlp.0.weight", + "ff.net.2.weight": "img_mlp.2.weight", + "ff_context.net.0.proj.weight": "txt_mlp.0.weight", + "ff_context.net.2.weight": "txt_mlp.2.weight", + "ff.linear_in.weight": "img_mlp.0.weight", + "ff.linear_out.weight": "img_mlp.2.weight", + "ff_context.linear_in.weight": "txt_mlp.0.weight", + "ff_context.linear_out.weight": "txt_mlp.2.weight", + } + return dst + mapping.get(tail, tail) + + +# QKV triplets to fuse on output: source tails -> target fused tail. +# Each tuple is (q_tail, k_tail, v_tail, fused_target_tail). +QKV_DOUBLE_TRIPLETS = [ + ("attn.to_q.weight", "attn.to_k.weight", "attn.to_v.weight", "img_attn.qkv.weight"), + ("attn.add_q_proj.weight", "attn.add_k_proj.weight", "attn.add_v_proj.weight", "txt_attn.qkv.weight"), +] + + +def _rewrite_single_stream(idx: str, tail: str) -> str: + dst = f"single_blocks.{idx}." 
+ mapping = { + "norm.linear.weight": "modulation.lin.weight", + "attn.norm_q.weight": "norm.query_norm.scale", + "attn.norm_k.weight": "norm.key_norm.scale", + "attn.to_qkv_mlp_proj.weight": "linear1.weight", + "attn.to_out.weight": "linear2.weight", + } + return dst + mapping.get(tail, tail) + + + + +def read_safetensors_header(path: str): + """Return (header dict, data start byte offset).""" + with open(path, "rb") as f: + hdr_len = struct.unpack(" {tail: (key, shard_path, data_off, info)} + raw_others = [] + for shard_path in collect_shard_paths(section_dir, "diffusion_pytorch_model"): + hdr, data_off = read_safetensors_header(shard_path) + for key, info in hdr.items(): + if key == "__metadata__": + continue + m = re.match(r"backbone\.transformer_blocks\.(\d+)\.(.*)$", key) + if m and any(m.group(2) in trip[:3] for trip in QKV_DOUBLE_TRIPLETS): + idx = m.group(1) + raw_by_block.setdefault(idx, {})[m.group(2)] = (key, shard_path, data_off, info) + else: + raw_others.append((key, shard_path, data_off, info)) + + for key, shard_path, data_off, info in raw_others: + entries.append((rewrite_fn(key), shard_path, data_off, info)) + + for block_idx, tails in raw_by_block.items(): + for q_tail, k_tail, v_tail, fused_tail in QKV_DOUBLE_TRIPLETS: + if q_tail in tails and k_tail in tails and v_tail in tails: + q = tails[q_tail]; k = tails[k_tail]; v = tails[v_tail] + # Validate shapes match. 
+ q_shape = q[3]["shape"]; k_shape = k[3]["shape"]; v_shape = v[3]["shape"] + if q_shape != k_shape or q_shape != v_shape: + raise ValueError(f"qkv shape mismatch at block {block_idx} {q_tail}: q={q_shape} k={k_shape} v={v_shape}") + fused_shape = [q_shape[0] * 3] + list(q_shape[1:]) + fused_info = { + "dtype": q[3]["dtype"], + "shape": fused_shape, + "_qkv_sources": [q, k, v], # pseudo field consumed by writer + } + entries.append((f"model.diffusion_model.double_blocks.{block_idx}.{fused_tail}", + None, None, fused_info)) + del tails[q_tail]; del tails[k_tail]; del tails[v_tail] + # Anything left in tails was an unmatched single - pass through. + for tail, payload in tails.items(): + entries.append((rewrite_fn(payload[0]),) + payload[1:]) + return entries + + +_DTYPE_BYTES = { + "BF16": 2, "F16": 2, "F32": 4, "F64": 8, + "U8": 1, "I8": 1, "I16": 2, "I32": 4, "I64": 8, + "BOOL": 1, +} + + +def _total_bytes(info: dict) -> int: + if "_qkv_sources" in info: + elems = 1 + for d in info["shape"]: + elems *= d + return elems * _DTYPE_BYTES[info["dtype"]] + start, end = info["data_offsets"] + return end - start + + +def write_consolidated(out_path: str, entries): + """Write a single safetensors file by streaming raw bytes from each shard. + + For qkv-fused entries, q/k/v are concatenated along axis 0 (row-major), so a + simple byte-level concatenation produces the correct fused layout for any + standard dtype. 
+ """ + entries = sorted(entries, key=lambda e: e[0]) + + new_header = {} + cur_offset = 0 + for new_key, shard_path, data_off, info in entries: + size = _total_bytes(info) + new_header[new_key] = { + "dtype": info["dtype"], + "shape": info["shape"], + "data_offsets": [cur_offset, cur_offset + size], + } + cur_offset += size + + header_json = json.dumps(new_header, separators=(",", ":")).encode("utf-8") + pad = (-len(header_json)) % 8 + header_json = header_json + (b" " * pad) + + def copy_range(src_path, src_data_off, src_info, out): + start, end = src_info["data_offsets"] + with open(src_path, "rb") as src: + src.seek(src_data_off + start) + remaining = end - start + while remaining > 0: + chunk = src.read(min(8 * 1024 * 1024, remaining)) + if not chunk: + raise IOError(f"Truncated tensor in {src_path}") + out.write(chunk) + remaining -= len(chunk) + + with open(out_path, "wb") as out: + out.write(struct.pack(" {args.output}") + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + write_consolidated(args.output, transformer_entries) + print(f"Done. 
Output size: {os.path.getsize(args.output) / 1e9:.2f} GB") + + +if __name__ == "__main__": + main() diff --git a/src/conditioning/conditioner.hpp b/src/conditioning/conditioner.hpp index e037fe76b..f6a686fa6 100644 --- a/src/conditioning/conditioner.hpp +++ b/src/conditioning/conditioner.hpp @@ -1518,7 +1518,7 @@ struct LLMEmbedder : public Conditioner { arch = LLM::LLMArch::GPT_OSS_20B; } else if (sd_version_is_pid(version)) { arch = LLM::LLMArch::GEMMA2_2B; - } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_krea2(version)) { + } else if (sd_version_is_ideogram4(version) || sd_version_is_boogu_image(version) || sd_version_is_sefi_image(version) || sd_version_is_krea2(version)) { arch = LLM::LLMArch::QWEN3_VL; } else if (sd_version_is_z_image(version) || version == VERSION_OVIS_IMAGE || version == VERSION_FLUX2_KLEIN) { arch = LLM::LLMArch::QWEN3; @@ -1997,6 +1997,18 @@ struct LLMEmbedder : public Conditioner { prompt_attn_range.second = static_cast(prompt.size()); prompt += "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n"; + } else if (sd_version_is_sefi_image(version)) { + prompt_template_encode_start_idx = 0; + min_length = 1024; + out_layers = {9, 18, 27}; + + prompt = "<|im_start|>user\n"; + + prompt_attn_range.first = static_cast(prompt.size()); + prompt += conditioner_params.text; + prompt_attn_range.second = static_cast(prompt.size()); + + prompt += "<|im_end|>\n<|im_start|>assistant\n"; } else if (version == VERSION_OVIS_IMAGE) { prompt_template_encode_start_idx = 28; min_length = prompt_template_encode_start_idx + 256; diff --git a/src/model.h b/src/model.h index cce309138..a53329bf8 100644 --- a/src/model.h +++ b/src/model.h @@ -49,6 +49,7 @@ enum SDVersion { VERSION_LONGCAT, VERSION_PID, VERSION_IDEOGRAM4, + VERSION_SEFI_IMAGE, VERSION_KREA2, VERSION_ESRGAN, VERSION_COUNT, @@ -187,6 +188,13 @@ static inline bool sd_version_is_ideogram4(SDVersion version) { return false; } +static inline bool 
sd_version_is_sefi_image(SDVersion version) { + if (version == VERSION_SEFI_IMAGE) { + return true; + } + return false; +} + static inline bool sd_version_is_krea2(SDVersion version) { if (version == VERSION_KREA2) { return true; @@ -202,7 +210,7 @@ static inline bool sd_version_uses_flux_vae(SDVersion version) { } static inline bool sd_version_uses_flux2_vae(SDVersion version) { - if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version)) { + if (sd_version_is_flux2(version) || sd_version_is_ernie_image(version) || sd_version_is_lens(version) || sd_version_is_ideogram4(version) || sd_version_is_sefi_image(version)) { return true; } return false; @@ -235,6 +243,7 @@ static inline bool sd_version_is_dit(SDVersion version) { sd_version_is_longcat(version) || sd_version_is_pid(version) || sd_version_is_ideogram4(version) || + sd_version_is_sefi_image(version) || sd_version_is_krea2(version)) { return true; } diff --git a/src/model/diffusion/flux.hpp b/src/model/diffusion/flux.hpp index b5e6c63bf..da2b24726 100644 --- a/src/model/diffusion/flux.hpp +++ b/src/model/diffusion/flux.hpp @@ -8,6 +8,7 @@ #include "model/common/rope.hpp" #include "model/diffusion/dit.hpp" #include "model/diffusion/model.hpp" +#include "model/diffusion/sefi_image.hpp" #include "model_loader.h" #define FLUX_GRAPH_SIZE 10240 @@ -26,6 +27,9 @@ namespace Flux { struct FluxConfig { SDVersion version = VERSION_FLUX; bool is_chroma = false; + bool is_sefi = false; + int64_t semantic_channels = 0; + float sefi_delta_t = 0.1f; int patch_size = 2; int64_t in_channels = 64; int64_t out_channels = 64; @@ -88,6 +92,21 @@ namespace Flux { config.share_modulation = true; config.ref_index_scale = 10.f; config.use_mlp_silu_act = true; + } else if (sd_version_is_sefi_image(version)) { + config.is_sefi = true; + config.semantic_channels = 16; + config.in_channels = 128 + config.semantic_channels; + config.patch_size = 1; + 
config.out_channels = 128 + config.semantic_channels; + config.mlp_ratio = 3.f; + config.theta = 2000; + config.axes_dim = {32, 32, 32, 32}; + config.vec_in_dim = 0; + config.qkv_bias = false; + config.disable_bias = true; + config.share_modulation = true; + config.ref_index_scale = 10.f; + config.use_mlp_silu_act = true; } else if (sd_version_is_longcat(version)) { config.context_in_dim = 3584; config.vec_in_dim = 0; @@ -681,14 +700,16 @@ namespace Flux { struct LastLayer : public GGMLBlock { bool prune_mod; + bool diffusers_adaLN_order = false; public: LastLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels, - bool prune_mod = false, - bool bias = true) - : prune_mod(prune_mod) { + bool prune_mod = false, + bool bias = true, + bool diffusers_adaLN_order = false) + : prune_mod(prune_mod), diffusers_adaLN_order(diffusers_adaLN_order) { blocks["norm_final"] = std::shared_ptr(new LayerNorm(hidden_size, 1e-06f, false)); blocks["linear"] = std::shared_ptr(new Linear(hidden_size, patch_size * patch_size * out_channels, bias)); if (!prune_mod) { @@ -723,8 +744,13 @@ namespace Flux { auto m = adaLN_modulation_1->forward(ctx, ggml_silu(ctx->ggml_ctx, c)); // [N, 2 * hidden_size] auto m_vec = ggml_ext_chunk(ctx->ggml_ctx, m, 2, 0); - shift = m_vec[0]; // [N, hidden_size] - scale = m_vec[1]; // [N, hidden_size] + if (diffusers_adaLN_order) { + scale = m_vec[0]; // diffusers AdaLayerNormContinuous: (scale, shift) + shift = m_vec[1]; + } else { + shift = m_vec[0]; // BFL flux: (shift, scale) + scale = m_vec[1]; + } } x = Flux::modulate(ctx->ggml_ctx, norm_final->forward(ctx, x), shift, scale); @@ -902,6 +928,8 @@ namespace Flux { } if (config.is_chroma) { blocks["distilled_guidance_layer"] = std::make_shared(config.in_dim, config.hidden_size); + } else if (config.is_sefi) { + blocks["dual_time_embed"] = std::make_shared(256, config.hidden_size); } else { blocks["time_in"] = std::make_shared(256, config.hidden_size, !config.disable_bias); if (config.vec_in_dim > 
0) { @@ -957,7 +985,7 @@ namespace Flux { config.in_channels); } else { - blocks["final_layer"] = std::make_shared(config.hidden_size, 1, config.out_channels, config.is_chroma, !config.disable_bias); + blocks["final_layer"] = std::make_shared(config.hidden_size, 1, config.out_channels, config.is_chroma, !config.disable_bias, config.is_sefi); } if (config.share_modulation) { @@ -1027,6 +1055,11 @@ namespace Flux { if (y != nullptr) { txt_img_mask = ggml_pad(ctx->ggml_ctx, y, static_cast(img->ne[1]), 0, 0, 0); } + } else if (config.is_sefi) { + auto dual_time_embed = std::dynamic_pointer_cast(blocks["dual_time_embed"]); + auto timestep_sem = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, 0); + auto timestep_tex = ggml_view_1d(ctx->ggml_ctx, timesteps, 1, ggml_element_size(timesteps)); + vec = dual_time_embed->forward(ctx, timestep_sem, timestep_tex); } else { auto time_in = std::dynamic_pointer_cast(blocks["time_in"]); vec = time_in->forward(ctx, ggml_ext_timestep_embedding(ctx->ggml_ctx, timesteps, 256, 10000, 1000.f)); @@ -1500,7 +1533,7 @@ namespace Flux { set_backend_tensor_data(mod_index_arange, mod_index_arange_vec.data()); } std::set txt_arange_dims; - if (sd_version_is_flux2(version)) { + if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) { txt_arange_dims = {3}; increase_ref_index = true; } else if (version == VERSION_OVIS_IMAGE) { diff --git a/src/model/diffusion/sefi_image.hpp b/src/model/diffusion/sefi_image.hpp new file mode 100644 index 000000000..aceb5baca --- /dev/null +++ b/src/model/diffusion/sefi_image.hpp @@ -0,0 +1,91 @@ +#ifndef __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__ +#define __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__ + +#include + +#include "model/common/block.hpp" + +namespace SefiImage { + struct SefiImageConfig { + int64_t semantic_channels = 16; + int64_t texture_latent_channels = 32; + int64_t timestep_guidance_in_dim = 256; + int64_t hidden_size = 3072; + float timestep_shift_alpha = 0.3f; + float delta_t = 0.1f; + + int64_t 
packed_texture_channels(int patch_size) const { + return texture_latent_channels * patch_size * patch_size; + } + + int64_t packed_input_channels(int patch_size) const { + return semantic_channels + packed_texture_channels(patch_size); + } + + static SefiImageConfig detect_from_weights(const String2TensorStorage& tensor_storage_map, + const std::string& prefix) { + SefiImageConfig config; + for (const auto& [name, tensor_storage] : tensor_storage_map) { + if (!starts_with(name, prefix)) { + continue; + } + if (ends_with(name, "dual_time_embed.semantic_embedder.linear_1.weight") && tensor_storage.n_dims == 2) { + config.timestep_guidance_in_dim = tensor_storage.ne[0]; + config.hidden_size = tensor_storage.ne[1] * 2; + } + } + LOG_DEBUG("sefi_image: semantic_channels = %" PRId64 ", texture_latent_channels = %" PRId64 ", hidden_size = %" PRId64, + config.semantic_channels, + config.texture_latent_channels, + config.hidden_size); + return config; + } + }; + + struct SefiTimestepEmbedding : public GGMLBlock { + public: + SefiTimestepEmbedding(int64_t in_channels, int64_t time_embed_dim) { + blocks["linear_1"] = std::shared_ptr(new Linear(in_channels, time_embed_dim, false)); + blocks["linear_2"] = std::shared_ptr(new Linear(time_embed_dim, time_embed_dim, false)); + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, ggml_tensor* sample) { + auto linear_1 = std::dynamic_pointer_cast(blocks["linear_1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["linear_2"]); + + sample = linear_1->forward(ctx, sample); + sample = ggml_silu_inplace(ctx->ggml_ctx, sample); + sample = linear_2->forward(ctx, sample); + return sample; + } + }; + + struct SefiDualTimestepEmbeddings : public GGMLBlock { + public: + SefiDualTimestepEmbeddings(int64_t in_channels, int64_t embedding_dim) { + GGML_ASSERT(embedding_dim % 2 == 0); + int64_t half_dim = embedding_dim / 2; + blocks["semantic_embedder"] = std::make_shared(in_channels, half_dim); + blocks["texture_embedder"] = 
std::make_shared(in_channels, half_dim); + timestep_guidance_in_dim = in_channels; + } + + ggml_tensor* forward(GGMLRunnerContext* ctx, + ggml_tensor* timestep_sem, + ggml_tensor* timestep_tex) { + auto semantic_embedder = std::dynamic_pointer_cast(blocks["semantic_embedder"]); + auto texture_embedder = std::dynamic_pointer_cast(blocks["texture_embedder"]); + + auto sem_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_sem, timestep_guidance_in_dim, 10000, 1.f); + auto tex_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep_tex, timestep_guidance_in_dim, 10000, 1.f); + auto sem_emb = semantic_embedder->forward(ctx, sem_proj); + auto tex_emb = texture_embedder->forward(ctx, tex_proj); + return ggml_concat(ctx->ggml_ctx, sem_emb, tex_emb, 0); + } + + private: + int64_t timestep_guidance_in_dim = 256; + }; +} // namespace SefiImage + +#endif // __SD_MODEL_DIFFUSION_SEFI_IMAGE_HPP__ diff --git a/src/model/te/llm.hpp b/src/model/te/llm.hpp index 12daf5637..427baa250 100644 --- a/src/model/te/llm.hpp +++ b/src/model/te/llm.hpp @@ -250,7 +250,7 @@ namespace LLM { config.intermediate_size = tensor_storage.ne[1]; } } - if (arch == LLMArch::QWEN3 && config.num_layers == 28) { + if ((arch == LLMArch::QWEN3 || arch == LLMArch::QWEN3_VL) && config.num_layers == 28) { config.num_heads = 16; } if (detected_vision_layers > 0) { diff --git a/src/model/vae/auto_encoder_kl.hpp b/src/model/vae/auto_encoder_kl.hpp index e41f5fd46..9d56bb88d 100644 --- a/src/model/vae/auto_encoder_kl.hpp +++ b/src/model/vae/auto_encoder_kl.hpp @@ -816,12 +816,15 @@ struct AutoEncoderKL : public VAE { } sd::Tensor diffusion_to_vae_latents(const sd::Tensor& latents) override { + sd::Tensor texture = sd_version_is_sefi_image(version) + ? 
sd::ops::slice(latents, 2, 16, 144) + : latents; if (sd_version_uses_flux2_vae(version)) { int channel_dim = 2; - auto [mean_tensor, std_tensor] = get_latents_mean_std(latents, channel_dim); - return (latents * std_tensor) / scale_factor + mean_tensor; + auto [mean_tensor, std_tensor] = get_latents_mean_std(texture, channel_dim); + return (texture * std_tensor) / scale_factor + mean_tensor; } - return (latents / scale_factor) + shift_factor; + return (texture / scale_factor) + shift_factor; } sd::Tensor vae_to_diffusion_latents(const sd::Tensor& latents) override { diff --git a/src/model_loader.cpp b/src/model_loader.cpp index 33c056b35..c239e22d2 100644 --- a/src/model_loader.cpp +++ b/src/model_loader.cpp @@ -66,7 +66,6 @@ const char* unused_tensors[] = { // "v_pred", // Used to detect SDXL vpred models "text_encoders.llm.output.weight", "text_encoders.llm.lm_head.", - "first_stage_model.bn.", }; bool is_unused_tensor(const std::string& name) { @@ -480,6 +479,9 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) { is_flux2 = true; } + if (tensor_storage.name.find("dual_time_embed.semantic_embedder.linear_1.weight") != std::string::npos) { + return VERSION_SEFI_IMAGE; + } if (tensor_storage.name.find("single_blocks.47.linear1.weight") != std::string::npos) { has_single_block_47 = true; } diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp index ccc8347b7..3df59ec6f 100644 --- a/src/name_conversion.cpp +++ b/src/name_conversion.cpp @@ -743,7 +743,7 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S name = convert_diffusers_unet_to_original_sdxl(name); } else if (sd_version_is_sd3(version)) { name = convert_diffusers_dit_to_original_sd3(name); - } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) { + } else if (sd_version_is_flux(version) || 
sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) { name = convert_diffusers_dit_to_original_flux(name); } else if (sd_version_is_z_image(version)) { name = convert_diffusers_dit_to_original_lumina2(name); diff --git a/src/runtime/denoiser.hpp b/src/runtime/denoiser.hpp index 28b29ef27..e907cf425 100644 --- a/src/runtime/denoiser.hpp +++ b/src/runtime/denoiser.hpp @@ -1006,6 +1006,8 @@ struct FluxFlowDenoiser : public DiscreteFlowDenoiser { } }; +struct SefiFlowDenoiser; + struct Flux2FlowDenoiser : public FluxFlowDenoiser { Flux2FlowDenoiser() = default; @@ -1038,6 +1040,80 @@ struct Flux2FlowDenoiser : public FluxFlowDenoiser { } }; +struct SefiFlowDenoiser : public Flux2FlowDenoiser { + static constexpr int kNumTrainTimesteps = 1000; + static constexpr int kSemChannels = 16; + static constexpr int kTotalChannels = 144; + + float delta_t = 0.1f; + float timestep_shift_alpha = 1.0f; + + std::vector sem_sigmas; + std::vector tex_sigmas; + std::vector sem_timesteps; + std::vector tex_timesteps; + + SefiFlowDenoiser() = default; + + static float apply_alpha_shift(float u_unit, float alpha) { + if (alpha == 1.0f) { + return u_unit; + } + float denom = 1.0f + (alpha - 1.0f) * u_unit; + return (alpha * u_unit) / denom; + } + + std::vector get_sigmas(uint32_t n, + int image_seq_len, + scheduler_t scheduler_type, + SDVersion version, + const char* extra_sample_args = nullptr) override { + sem_sigmas.clear(); + tex_sigmas.clear(); + sem_timesteps.clear(); + tex_timesteps.clear(); + + for (const auto& [key, value] : parse_key_value_args(extra_sample_args, "sefi scheduler arg")) { + if (key == "sefi_alpha") { + if (!parse_strict_float(value, timestep_shift_alpha)) { + LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str()); + } + } else if (key == "sefi_delta_t") { + if (!parse_strict_float(value, delta_t)) { + LOG_WARN("ignoring invalid sefi scheduler arg '%s=%s'", key.c_str(), value.c_str()); + 
} + } + } + + for (uint32_t i = 0; i <= n; ++i) { + float u_base = static_cast(i) / static_cast(n); + float u_shifted = apply_alpha_shift(u_base, timestep_shift_alpha); + float u_sem_raw = u_shifted * (1.0f + delta_t); + + float u_sem = std::min(u_sem_raw, 1.0f); + float u_tex = std::max(0.0f, std::min(u_sem_raw - delta_t, 1.0f)); + + int idx_sem = std::min(kNumTrainTimesteps - 1, + std::max(0, static_cast(u_sem * (kNumTrainTimesteps - 1)))); + int idx_tex = std::min(kNumTrainTimesteps - 1, + std::max(0, static_cast(u_tex * (kNumTrainTimesteps - 1)))); + + float t_sem = static_cast(kNumTrainTimesteps - idx_sem); + float t_tex = static_cast(kNumTrainTimesteps - idx_tex); + float sigma_sem = t_sem / static_cast(kNumTrainTimesteps); + float sigma_tex = t_tex / static_cast(kNumTrainTimesteps); + + sem_timesteps.push_back(t_sem); + tex_timesteps.push_back(t_tex); + sem_sigmas.push_back(sigma_sem); + tex_sigmas.push_back(sigma_tex); + } + LOG_DEBUG("SefiFlowDenoiser: built %u-step dual schedule (alpha=%.2f delta_t=%.2f)", + n, timestep_shift_alpha, delta_t); + return tex_sigmas; + } +}; + typedef std::function&, float, int)> denoise_cb_t; static std::pair get_ancestral_step(float sigma_from, @@ -1141,6 +1217,40 @@ static sd::Tensor sample_euler_ancestral(denoise_cb_t model, return x; } +static sd::Tensor sample_sefi_euler(SefiFlowDenoiser* sefi, + denoise_cb_t model, + sd::Tensor x) { + const std::vector& sigma_tex_vec = sefi->tex_sigmas; + const std::vector& sigma_sem_vec = sefi->sem_sigmas; + int steps = static_cast(sigma_tex_vec.size()) - 1; + for (int i = 0; i < steps; i++) { + float sigma_tex_cur = sigma_tex_vec[i]; + float sigma_tex_next = sigma_tex_vec[i + 1]; + float sigma_sem_cur = sigma_sem_vec[i]; + float sigma_sem_next = sigma_sem_vec[i + 1]; + if (sigma_tex_cur <= 1e-9f) { + continue; + } + auto denoised_opt = model(x, sigma_tex_cur, i + 1); + if (denoised_opt.pred.empty()) { + return {}; + } + sd::Tensor denoised = std::move(denoised_opt.pred); + sd::Tensor 
velocity = (x - denoised) / sigma_tex_cur; + + auto x_sem = sd::ops::slice(x, 2, 0, SefiFlowDenoiser::kSemChannels); + auto x_tex = sd::ops::slice(x, 2, SefiFlowDenoiser::kSemChannels, SefiFlowDenoiser::kTotalChannels); + auto vel_sem = sd::ops::slice(velocity, 2, 0, SefiFlowDenoiser::kSemChannels); + auto vel_tex = sd::ops::slice(velocity, 2, SefiFlowDenoiser::kSemChannels, SefiFlowDenoiser::kTotalChannels); + auto x_sem_next = x_sem + vel_sem * (sigma_sem_next - sigma_sem_cur); + auto x_tex_next = x_tex + vel_tex * (sigma_tex_next - sigma_tex_cur); + + sd::ops::slice_assign(&x, 2, 0, SefiFlowDenoiser::kSemChannels, x_sem_next); + sd::ops::slice_assign(&x, 2, SefiFlowDenoiser::kSemChannels, SefiFlowDenoiser::kTotalChannels, x_tex_next); + } + return x; +} + static sd::Tensor sample_euler(denoise_cb_t model, sd::Tensor x, const std::vector& sigmas) { @@ -2056,7 +2166,13 @@ static sd::Tensor sample_k_diffusion(sample_method_t method, std::shared_ptr rng, float eta, bool is_flow_denoiser, - const char* extra_sample_args) { + const char* extra_sample_args, + std::shared_ptr denoiser_for_dispatch = nullptr) { + if (denoiser_for_dispatch) { + if (auto sefi = std::dynamic_pointer_cast(denoiser_for_dispatch)) { + return sample_sefi_euler(sefi.get(), model, std::move(x)); + } + } SamplerExtraArgs extra_args = parse_key_value_args(extra_sample_args, "extra sample arg"); switch (method) { case EULER_A_SAMPLE_METHOD: diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp index 311c75113..4925340ba 100644 --- a/src/stable-diffusion.cpp +++ b/src/stable-diffusion.cpp @@ -96,6 +96,7 @@ const char* model_version_to_str[] = { "Longcat-Image", "PiD", "Ideogram 4", + "SeFi-Image", "Krea2", "ESRGAN", }; @@ -691,7 +692,7 @@ class StableDiffusionGGML { version, sd_ctx_params->chroma_use_dit_mask, model_manager); - } else if (sd_version_is_flux2(version)) { + } else if (sd_version_is_flux2(version) || sd_version_is_sefi_image(version)) { bool is_chroma = false; 
cond_stage_model = std::make_shared(backend_for(SDBackendModule::TE), tensor_storage_map, @@ -1302,6 +1303,8 @@ class StableDiffusionGGML { } else if (sd_version_is_krea2(version)) { default_flow_shift = 1.15f; } + } else if (sd_version_is_sefi_image(version)) { + pred_type = SEFI_FLOW_PRED; } else if (sd_version_is_flux2(version)) { pred_type = FLUX2_FLOW_PRED; } else { @@ -1341,6 +1344,11 @@ class StableDiffusionGGML { denoiser = std::make_shared(); break; } + case SEFI_FLOW_PRED: { + LOG_INFO("running in SeFi-Image dual-time FLOW mode"); + denoiser = std::make_shared(); + break; + } default: { LOG_ERROR("Unknown predition type %i", pred_type); return false; @@ -1646,7 +1654,16 @@ class StableDiffusionGGML { std::vector process_timesteps(const std::vector& timesteps, const sd::Tensor& init_latent, - const sd::Tensor& denoise_mask) { + const sd::Tensor& denoise_mask, + int step) { + if (auto sefi_denoiser = std::dynamic_pointer_cast(denoiser)) { + int sched_idx = step > 0 ? step - 1 : 0; + if (sched_idx >= static_cast(sefi_denoiser->tex_timesteps.size())) { + sched_idx = static_cast(sefi_denoiser->tex_timesteps.size()) - 1; + } + return {sefi_denoiser->sem_timesteps[sched_idx], + sefi_denoiser->tex_timesteps[sched_idx]}; + } if (diffusion_model->get_desc() == "Wan2.2-TI2V-5B") { int64_t frame_count = init_latent.shape()[2]; auto new_timesteps = std::vector(static_cast(frame_count), timesteps[0]); @@ -2058,7 +2075,7 @@ class StableDiffusionGGML { timesteps_vec = process_ltxav_video_timesteps(base_timesteps_vec, init_latent, denoise_mask); audio_timesteps_tensor = sd::Tensor({static_cast(base_timesteps_vec.size())}, base_timesteps_vec); } else { - timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); + timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask, step); } const std::vector& scaling_timesteps_vec = (sd_version_is_ltxav(version) && !denoise_mask.empty()) ? 
base_timesteps_vec @@ -2128,7 +2145,7 @@ class StableDiffusionGGML { diffusion_params.extra = UNetDiffusionExtra{-1, &controls, control_strength}; } else if (sd_version_is_sd3(version)) { diffusion_params.extra = SkipLayerDiffusionExtra{local_skip_layers}; - } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version)) { + } else if (sd_version_is_flux(version) || sd_version_is_flux2(version) || sd_version_is_longcat(version) || sd_version_is_sefi_image(version)) { diffusion_params.extra = FluxDiffusionExtra{&guidance_tensor, local_skip_layers}; } else if (sd_version_is_anima(version)) { @@ -2272,7 +2289,7 @@ class StableDiffusionGGML { return output; }; - auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args); + auto x0_opt = sample_k_diffusion(method, denoise, x_t, sigmas, sampler_rng, eta, is_flow_denoiser, extra_sample_args, denoiser); if (x0_opt.empty()) { LOG_ERROR("Diffusion model sampling failed"); if (control_net) { @@ -2333,6 +2350,8 @@ class StableDiffusionGGML { latent_channel = 3; } else if (sd_version_is_pid(version)) { latent_channel = 3; + } else if (sd_version_is_sefi_image(version)) { + latent_channel = 144; } else if (sd_version_uses_flux2_vae(version)) { latent_channel = 128; } else {