From 1c47e31367b78a770d8ad5452d2e80cab5b51e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Foucault?= Date: Mon, 9 Jun 2025 12:36:06 +0200 Subject: [PATCH] GPU: Enable GL multithreaded compilation by default This reduces the waiting time caused by shader compilation on some GPU-driver combinations. New settings in the User Preferences make it possible to override the default number of worker threads and optionally use subprocesses. We still use only one worker thread in cases where there is no benefit in adding more workers (like the AMD Pro driver and Intel on Windows). It doesn't scale as much as subprocesses for material shader compilation but that is for other reasons explained in #139818. Add some heuristics to avoid too much memory usage and/or too many stalls. Also add some heuristics for the default number of subprocesses on the platforms that show scaling. Historically, multithreaded compilation was prevented by the need for a context per thread inside the `DRWShader` module. Also there was no good scaling at that time. But nowadays the numbers show different results, with good scaling for a reasonable number of threads on many platforms. Even if we are going for Vulkan in the next release, most of the legacy hardware will still use OpenGL for a few more releases. So it is relevant to make this easy improvement. See pull request for measurements. 
Pull Request: https://projects.blender.org/blender/blender/pulls/139821 --- release/datafiles/userdef/userdef_default.c | 3 +- scripts/modules/rna_manual_reference.py | 2 +- scripts/startup/bl_ui/space_userpref.py | 7 +- .../blender/blenkernel/BKE_blender_version.h | 2 +- .../blenloader/intern/versioning_userdef.cc | 6 ++ .../draw/engines/eevee/eevee_instance.cc | 9 ++- source/blender/gpu/GPU_capabilities.hh | 2 +- source/blender/gpu/intern/gpu_capabilities.cc | 4 +- .../gpu/intern/gpu_capabilities_private.hh | 2 + source/blender/gpu/intern/gpu_pass.cc | 5 +- .../gpu/intern/gpu_shader_create_info.cc | 12 +--- source/blender/gpu/metal/mtl_shader.mm | 1 - source/blender/gpu/opengl/gl_backend.cc | 65 ++++++++++++++++--- source/blender/gpu/opengl/gl_backend.hh | 7 +- source/blender/gpu/opengl/gl_shader.hh | 4 +- source/blender/makesdna/DNA_userdef_types.h | 17 +++-- .../blender/makesdna/intern/dna_rename_defs.h | 1 + source/blender/makesrna/intern/rna_userdef.cc | 32 +++++++-- 18 files changed, 136 insertions(+), 45 deletions(-) diff --git a/release/datafiles/userdef/userdef_default.c b/release/datafiles/userdef/userdef_default.c index e81d5a32c26..dbf9f37a22e 100644 --- a/release/datafiles/userdef/userdef_default.c +++ b/release/datafiles/userdef/userdef_default.c @@ -116,7 +116,8 @@ const UserDef U_default = { #else .gpu_backend = GPU_BACKEND_OPENGL, #endif - .max_shader_compilation_subprocesses = 0, + .gpu_shader_workers = 0, + .shader_compilation_method = USER_SHADER_COMPILE_THREAD, /** Initialized by: #BKE_studiolight_default. 
*/ .light_param = {{0}}, diff --git a/scripts/modules/rna_manual_reference.py b/scripts/modules/rna_manual_reference.py index 56cec1aa308..e1250f32156 100644 --- a/scripts/modules/rna_manual_reference.py +++ b/scripts/modules/rna_manual_reference.py @@ -30,7 +30,6 @@ url_manual_mapping = ( ("bpy.types.clothcollisionsettings.vertex_group_object_collisions*", "physics/cloth/settings/collisions.html#bpy-types-clothcollisionsettings-vertex-group-object-collisions"), ("bpy.types.gpencilsculptsettings.use_automasking_material_active*", "grease_pencil/modes/sculpting/introduction.html#bpy-types-gpencilsculptsettings-use-automasking-material-active"), ("bpy.types.gpencilsculptsettings.use_automasking_material_stroke*", "grease_pencil/modes/sculpting/introduction.html#bpy-types-gpencilsculptsettings-use-automasking-material-stroke"), - ("bpy.types.preferencessystem.max_shader_compilation_subprocesses*", "editors/preferences/system.html#bpy-types-preferencessystem-max-shader-compilation-subprocesses"), ("bpy.types.cycleslightsettings.use_multiple_importance_sampling*", "render/cycles/light_settings.html#bpy-types-cycleslightsettings-use-multiple-importance-sampling"), ("bpy.types.fluiddomainsettings.sndparticle_potential_max_energy*", "physics/fluid/type/domain/liquid/particles.html#bpy-types-fluiddomainsettings-sndparticle-potential-max-energy"), ("bpy.types.fluiddomainsettings.sndparticle_potential_min_energy*", "physics/fluid/type/domain/liquid/particles.html#bpy-types-fluiddomainsettings-sndparticle-potential-min-energy"), @@ -80,6 +79,7 @@ url_manual_mapping = ( ("bpy.types.preferencesedit.grease_pencil_euclidean_distance*", "editors/preferences/editing.html#bpy-types-preferencesedit-grease-pencil-euclidean-distance"), ("bpy.types.preferencesedit.grease_pencil_manhattan_distance*", "editors/preferences/editing.html#bpy-types-preferencesedit-grease-pencil-manhattan-distance"), ("bpy.types.preferencesinput.mouse_emulate_3_button_modifier*", 
"editors/preferences/input.html#bpy-types-preferencesinput-mouse-emulate-3-button-modifier"), + ("bpy.types.preferencessystem.max_shader_compilation_workers*", "editors/preferences/system.html#bpy-types-preferencessystem-max-shader-compilation-workers"), ("bpy.types.brushgpencilsettings.use_stroke_random_strength*", "grease_pencil/modes/draw/brushes/draw.html#bpy-types-brushgpencilsettings-use-stroke-random-strength"), ("bpy.types.clothsettings.vertex_group_structural_stiffness*", "physics/cloth/settings/property_weights.html#bpy-types-clothsettings-vertex-group-structural-stiffness"), ("bpy.types.cyclesrendersettings.film_transparent_roughness*", "render/cycles/render_settings/film.html#bpy-types-cyclesrendersettings-film-transparent-roughness"), diff --git a/scripts/startup/bl_ui/space_userpref.py b/scripts/startup/bl_ui/space_userpref.py index 9952ca2498f..71e2e251a28 100644 --- a/scripts/startup/bl_ui/space_userpref.py +++ b/scripts/startup/bl_ui/space_userpref.py @@ -809,8 +809,11 @@ class USERPREF_PT_system_memory(SystemPanel, CenterAlignMixIn, Panel): if sys.platform != "darwin": layout.separator() - col = layout.column() - col.prop(system, "max_shader_compilation_subprocesses") + col = layout.column(align=True) + col.active = system.gpu_backend != 'VULKAN' + col.row().prop(system, "shader_compilation_method", expand=True) + label = "Threads" if system.shader_compilation_method == 'THREAD' else "Subprocesses" + col.prop(system, "gpu_shader_workers", text=label) class USERPREF_PT_system_video_sequencer(SystemPanel, CenterAlignMixIn, Panel): diff --git a/source/blender/blenkernel/BKE_blender_version.h b/source/blender/blenkernel/BKE_blender_version.h index c3c1ece4fa9..9fd1a0c3f44 100644 --- a/source/blender/blenkernel/BKE_blender_version.h +++ b/source/blender/blenkernel/BKE_blender_version.h @@ -27,7 +27,7 @@ /* Blender file format version. 
*/ #define BLENDER_FILE_VERSION BLENDER_VERSION -#define BLENDER_FILE_SUBVERSION 85 +#define BLENDER_FILE_SUBVERSION 86 /* Minimum Blender version that supports reading file written with the current * version. Older Blender versions will test this and cancel loading the file, showing a warning to diff --git a/source/blender/blenloader/intern/versioning_userdef.cc b/source/blender/blenloader/intern/versioning_userdef.cc index afa4b9197b4..ba1d39df4b6 100644 --- a/source/blender/blenloader/intern/versioning_userdef.cc +++ b/source/blender/blenloader/intern/versioning_userdef.cc @@ -1496,6 +1496,12 @@ void blo_do_versions_userdef(UserDef *userdef) } } + if (!USER_VERSION_ATLEAST(405, 86)) { + if (userdef->gpu_shader_workers > 0) { + userdef->shader_compilation_method = USER_SHADER_COMPILE_SUBPROCESS; + } + } + /** * Always bump subversion in BKE_blender_version.h when adding versioning * code here, and wrap it inside a USER_VERSION_ATLEAST check. diff --git a/source/blender/draw/engines/eevee/eevee_instance.cc b/source/blender/draw/engines/eevee/eevee_instance.cc index 509c49ce817..d566e1870e8 100644 --- a/source/blender/draw/engines/eevee/eevee_instance.cc +++ b/source/blender/draw/engines/eevee/eevee_instance.cc @@ -722,11 +722,14 @@ void Instance::draw_viewport() } if (materials.queued_shaders_count > 0) { info_append_i18n("Compiling shaders ({} remaining)", materials.queued_shaders_count); - if (!GPU_use_parallel_compilation() && - GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_ANY, GPU_DRIVER_ANY, GPU_BACKEND_OPENGL)) + if (GPU_backend_get_type() == GPU_BACKEND_OPENGL && !GPU_use_subprocess_compilation() && + /* Only recommend subprocesses when there is known gain. 
*/ + (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_ANY) || + GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY) || + GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OFFICIAL))) { info_append_i18n( - "Increasing Preferences > System > Max Shader Compilation Subprocesses may improve " + "Setting Preferences > System > Shader Compilation Method to Subprocess might improve " "compilation time."); } } diff --git a/source/blender/gpu/GPU_capabilities.hh b/source/blender/gpu/GPU_capabilities.hh index f8a36498127..62bb7ab6af6 100644 --- a/source/blender/gpu/GPU_capabilities.hh +++ b/source/blender/gpu/GPU_capabilities.hh @@ -43,7 +43,7 @@ const char *GPU_extension_get(int i); int GPU_texture_size_with_limit(int res); -bool GPU_use_parallel_compilation(); +bool GPU_use_subprocess_compilation(); int GPU_max_parallel_compilations(); bool GPU_stencil_clasify_buffer_workaround(); diff --git a/source/blender/gpu/intern/gpu_capabilities.cc b/source/blender/gpu/intern/gpu_capabilities.cc index d460d1743da..83e89a85070 100644 --- a/source/blender/gpu/intern/gpu_capabilities.cc +++ b/source/blender/gpu/intern/gpu_capabilities.cc @@ -131,9 +131,9 @@ int GPU_max_samplers() return GCaps.max_samplers; } -bool GPU_use_parallel_compilation() +bool GPU_use_subprocess_compilation() { - return GCaps.max_parallel_compilations > 0; + return GCaps.use_subprocess_shader_compilations; } int GPU_max_parallel_compilations() diff --git a/source/blender/gpu/intern/gpu_capabilities_private.hh b/source/blender/gpu/intern/gpu_capabilities_private.hh index 6d5c887efb2..982c009fb9b 100644 --- a/source/blender/gpu/intern/gpu_capabilities_private.hh +++ b/source/blender/gpu/intern/gpu_capabilities_private.hh @@ -63,6 +63,8 @@ struct GPUCapabilities { bool node_link_instancing_workaround = false; bool line_directive_workaround = false; + bool use_subprocess_shader_compilations = false; + /* Vulkan related workarounds. 
*/ bool render_pass_workaround = false; diff --git a/source/blender/gpu/intern/gpu_pass.cc b/source/blender/gpu/intern/gpu_pass.cc index 7845127833e..f78ab33cc26 100644 --- a/source/blender/gpu/intern/gpu_pass.cc +++ b/source/blender/gpu/intern/gpu_pass.cc @@ -165,9 +165,8 @@ bool GPU_pass_should_optimize(GPUPass *pass) return (GPU_backend_get_type() == GPU_BACKEND_METAL) && pass->should_optimize; #if 0 - /* Returns optimization heuristic prepared during initial codegen. - * NOTE: Optimization limited to parallel compilation as it causes CPU stalls otherwise. */ - return pass->should_optimize && GPU_use_parallel_compilation(); + /* Returns optimization heuristic prepared during initial codegen. */ + return pass->should_optimize; #endif } diff --git a/source/blender/gpu/intern/gpu_shader_create_info.cc b/source/blender/gpu/intern/gpu_shader_create_info.cc index 6e10d388888..0bcd518a7c3 100644 --- a/source/blender/gpu/intern/gpu_shader_create_info.cc +++ b/source/blender/gpu/intern/gpu_shader_create_info.cc @@ -574,16 +574,8 @@ bool gpu_shader_create_info_compile(const char *name_starts_with_filter) } } - Vector result; - if (GPU_use_parallel_compilation() == false) { - for (const GPUShaderCreateInfo *info : infos) { - result.append(GPU_shader_create_from_info(info)); - } - } - else { - BatchHandle batch = GPU_shader_batch_create_from_infos(infos); - result = GPU_shader_batch_finalize(batch); - } + BatchHandle batch = GPU_shader_batch_create_from_infos(infos); + Vector result = GPU_shader_batch_finalize(batch); for (int i : result.index_range()) { const ShaderCreateInfo *info = reinterpret_cast(infos[i]); diff --git a/source/blender/gpu/metal/mtl_shader.mm b/source/blender/gpu/metal/mtl_shader.mm index 5c1bed55b60..78d40b466a8 100644 --- a/source/blender/gpu/metal/mtl_shader.mm +++ b/source/blender/gpu/metal/mtl_shader.mm @@ -1551,7 +1551,6 @@ MTLComputePipelineStateInstance *MTLShader::bake_compute_pipeline_state( MTLShaderCompiler::MTLShaderCompiler() : 
ShaderCompiler(GPU_max_parallel_compilations(), GPUWorker::ContextType::PerThread, true) { - BLI_assert(GPU_use_parallel_compilation()); } Shader *MTLShaderCompiler::compile_shader(const shader::ShaderCreateInfo &info) diff --git a/source/blender/gpu/opengl/gl_backend.cc b/source/blender/gpu/opengl/gl_backend.cc index c0d6b17d219..e2d7ebbfe94 100644 --- a/source/blender/gpu/opengl/gl_backend.cc +++ b/source/blender/gpu/opengl/gl_backend.cc @@ -723,17 +723,66 @@ void GLBackend::capabilities_init() detect_workarounds(); #if BLI_SUBPROCESS_SUPPORT - if (GCaps.max_parallel_compilations == -1) { - GCaps.max_parallel_compilations = std::min(int(U.max_shader_compilation_subprocesses), - BLI_system_thread_count()); - } + GCaps.use_subprocess_shader_compilations = U.shader_compilation_method == + USER_SHADER_COMPILE_SUBPROCESS; +#else + GCaps.use_subprocess_shader_compilations = false; +#endif if (G.debug & G_DEBUG_GPU_RENDERDOC) { /* Avoid crashes on RenderDoc sessions. */ - GCaps.max_parallel_compilations = 0; + GCaps.use_subprocess_shader_compilations = false; } -#else - GCaps.max_parallel_compilations = 0; -#endif + + int thread_count = U.gpu_shader_workers; + + if (thread_count == 0) { + /* Good default based on measurements. */ + + /* Always have at least 1 worker. */ + thread_count = 1; + + if (GCaps.use_subprocess_shader_compilations) { + /* Use reasonable number of worker by default when there are known gains. */ + if (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_OFFICIAL) || + GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OFFICIAL) || + GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY)) + { + /* Subprocess is too costly in memory (>150MB per worker) to have better defaults. 
*/ + thread_count = std::max(1, std::min(4, BLI_system_thread_count() / 2)); + } + } + else if (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_OFFICIAL)) { + /* Best middle ground between memory usage and speedup as Nvidia context memory footprint + * is quite heavy (~25MB). Moreover we have diminishing return after this because of PSO + * compilation blocking the main thread. + * Can be revisited if we find a way to delete the worker thread context after finishing + * compilation, and fix the scheduling bubbles (#139775). */ + thread_count = 4; + } + else if (GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OPENSOURCE) || + GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_UNIX, GPU_DRIVER_ANY)) + { + /* Mesa has very good compilation time and doesn't block the main thread. + * The memory footprint of the worker context is rather small (<10MB). + * Shader compilation gets much slower as the number of threads increases. */ + thread_count = 8; + } + else if (GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OFFICIAL)) { + /* AMD proprietary driver's context have huge memory footprint (~45MB). + * There is also not much gain from parallelization. */ + thread_count = 1; + } + else if (GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY)) { + /* Intel windows driver offer almost no speedup with parallel compilation. */ + thread_count = 1; + } + } + + /* Allow thread count override option to limit the number of workers and avoid allocating more + * workers than needed. Also ensures that there is always 1 thread available for the UI. */ + int max_thread_count = std::max(1, BLI_system_thread_count() - 1); + + GCaps.max_parallel_compilations = std::min(thread_count, max_thread_count); /* Disable this feature entirely when not debugging. 
*/ if ((G.debug & G_DEBUG_GPU) == 0) { diff --git a/source/blender/gpu/opengl/gl_backend.hh b/source/blender/gpu/opengl/gl_backend.hh index 2fe45d0fa66..9313d1f87cf 100644 --- a/source/blender/gpu/opengl/gl_backend.hh +++ b/source/blender/gpu/opengl/gl_backend.hh @@ -9,10 +9,15 @@ #pragma once #include "GPU_capabilities.hh" +#include "GPU_platform.hh" + #include "gpu_backend.hh" +#include "BLI_threads.h" #include "BLI_vector.hh" +#include "gpu_capabilities_private.hh" + #ifdef WITH_RENDERDOC # include "renderdoc_api.hh" #endif @@ -56,7 +61,7 @@ class GLBackend : public GPUBackend { void init_resources() override { - if (GPU_use_parallel_compilation()) { + if (GCaps.use_subprocess_shader_compilations) { compiler_ = MEM_new(__func__); } else { diff --git a/source/blender/gpu/opengl/gl_shader.hh b/source/blender/gpu/opengl/gl_shader.hh index 2d1257c2380..1c6de5daa66 100644 --- a/source/blender/gpu/opengl/gl_shader.hh +++ b/source/blender/gpu/opengl/gl_shader.hh @@ -198,8 +198,8 @@ class GLShader : public Shader { class GLShaderCompiler : public ShaderCompiler { public: - GLShaderCompiler(uint32_t threads_count = 1) - : ShaderCompiler(threads_count, GPUWorker::ContextType::PerThread, true){}; + GLShaderCompiler() + : ShaderCompiler(GPU_max_parallel_compilations(), GPUWorker::ContextType::PerThread, true){}; virtual void specialize_shader(ShaderSpecialization &specialization) override; }; diff --git a/source/blender/makesdna/DNA_userdef_types.h b/source/blender/makesdna/DNA_userdef_types.h index 13b1600c4c8..71a2c1b5fcf 100644 --- a/source/blender/makesdna/DNA_userdef_types.h +++ b/source/blender/makesdna/DNA_userdef_types.h @@ -487,13 +487,17 @@ typedef struct UserDef { int gpu_preferred_index; uint32_t gpu_preferred_vendor_id; uint32_t gpu_preferred_device_id; - char _pad16[4]; + + /** Max number of parallel shader compilation workers. */ + short gpu_shader_workers; + /** eUserpref_ShaderCompileMethod (OpenGL only). 
*/ + short shader_compilation_method; + + char _pad16[2]; + /** #eGPUBackendType */ short gpu_backend; - /** Max number of parallel shader compilation subprocesses. */ - short max_shader_compilation_subprocesses; - /** Number of samples for FPS display calculations. */ short playback_fps_samples; @@ -1125,6 +1129,11 @@ typedef enum eUserpref_SeqEditorFlags { USER_SEQ_ED_CONNECT_STRIPS_BY_DEFAULT = (1 << 1), } eUserpref_SeqEditorFlags; +typedef enum eUserpref_ShaderCompileMethod { + USER_SHADER_COMPILE_THREAD = 0, + USER_SHADER_COMPILE_SUBPROCESS = 1, +} eUserpref_ShaderCompileMethod; + /* Locale Ids. Auto will try to get local from OS. Our default is English though. */ /** #UserDef.language */ enum { diff --git a/source/blender/makesdna/intern/dna_rename_defs.h b/source/blender/makesdna/intern/dna_rename_defs.h index 40e0553b2cc..690194786d7 100644 --- a/source/blender/makesdna/intern/dna_rename_defs.h +++ b/source/blender/makesdna/intern/dna_rename_defs.h @@ -215,6 +215,7 @@ DNA_STRUCT_RENAME_MEMBER(UVProjectModifierData, num_projectors, projectors_num) DNA_STRUCT_RENAME_MEMBER(UserDef, autokey_flag, keying_flag) DNA_STRUCT_RENAME_MEMBER(UserDef, gp_manhattendist, gp_manhattandist) DNA_STRUCT_RENAME_MEMBER(UserDef, pythondir, pythondir_legacy) +DNA_STRUCT_RENAME_MEMBER(UserDef, max_shader_compilation_subprocesses, gpu_shader_workers) DNA_STRUCT_RENAME_MEMBER(VFont, name, filepath) DNA_STRUCT_RENAME_MEMBER(View3D, far, clip_end) DNA_STRUCT_RENAME_MEMBER(View3D, local_collections_uuid, local_collections_uid) diff --git a/source/blender/makesrna/intern/rna_userdef.cc b/source/blender/makesrna/intern/rna_userdef.cc index c2543906a96..e7e0aa9927c 100644 --- a/source/blender/makesrna/intern/rna_userdef.cc +++ b/source/blender/makesrna/intern/rna_userdef.cc @@ -6349,14 +6349,36 @@ static void rna_def_userdef_system(BlenderRNA *brna) "Preferred device to select during detection (requires restarting " "Blender for changes to take effect)"); - prop = RNA_def_property(srna, 
"max_shader_compilation_subprocesses", PROP_INT, PROP_NONE); - RNA_def_property_range(prop, 0, INT16_MAX); + prop = RNA_def_property(srna, "gpu_shader_workers", PROP_INT, PROP_NONE); + RNA_def_property_range(prop, 0, 32); RNA_def_property_ui_text(prop, - "Max Shader Compilation Subprocesses", - "Max number of parallel shader compilation subprocesses, " + "Shader Compilation Workers", + "Number of shader compilation threads or subprocesses, " "clamped at the max threads supported by the CPU " "(requires restarting Blender for changes to take effect). " - "Setting it to 0 disables subprocess shader compilation."); + "A higher number increases the RAM usage while reducing " + "compilation time. A value of 0 will use automatic configuration. " + "(OpenGL only)"); + + static const EnumPropertyItem shader_compilation_method_items[] = { + {USER_SHADER_COMPILE_THREAD, "THREAD", 0, "Thread", "Use threads for compiling shaders"}, + {USER_SHADER_COMPILE_SUBPROCESS, + "SUBPROCESS", + 0, + "Subprocess", + "Use subprocesses for compiling shaders"}, + {0, nullptr, 0, nullptr, nullptr}, + }; + + prop = RNA_def_property(srna, "shader_compilation_method", PROP_ENUM, PROP_NONE); + RNA_def_property_enum_items(prop, shader_compilation_method_items); + RNA_def_property_ui_text(prop, + "Shader Compilation Method", + "Compilation method used for compiling shaders in parallel. " + "Subprocess requires a lot more RAM for each worker " + "but might compile shaders faster on some systems. " + "Requires restarting Blender for changes to take effect. " + "(OpenGL only)"); /* Network. */