GPU: Enable GL multithreaded compilation by default

This reduces the waiting time caused by shader compilation on some GPU-driver combinations. A new setting in the User Preferences makes it possible to override the default number of worker threads and optionally use subprocesses. We still use only one worker thread in cases where there is no benefit from adding more workers (like the AMD pro driver and Intel on Windows). It doesn't scale as well as subprocesses for material shader compilation, but that is for other reasons explained in #139818. Add some heuristics to avoid too much memory usage and / or too many stalls. Also add some heuristics for the default number of subprocesses on the platforms that show scaling. Historically, multithreaded compilation was prevented by the need for a context per thread inside the `DRWShader` module. Also, there was no good scaling at that time. But nowadays the numbers show different results, with good scaling for a reasonable number of threads on many platforms. Even if we are going for Vulkan in the next release, most of the legacy hardware will still use OpenGL for a few more releases. So it is relevant to make this easy improvement. See the pull request for measurements. Pull Request: https://projects.blender.org/blender/blender/pulls/139821
2025-06-09 12:36:06 +02:00 · 2025-06-09 12:36:06 +02:00 · 1c47e31367
commit 1c47e31367
parent b58c5f6e77
18 changed files with 136 additions and 45 deletions
--- a/release/datafiles/userdef/userdef_default.c
+++ b/release/datafiles/userdef/userdef_default.c
@ -116,7 +116,8 @@ const UserDef U_default = {
 #else
    .gpu_backend = GPU_BACKEND_OPENGL,
 #endif
-    .max_shader_compilation_subprocesses = 0,
+    .gpu_shader_workers = 0,
+    .shader_compilation_method = USER_SHADER_COMPILE_THREAD,

    /** Initialized by: #BKE_studiolight_default. */
    .light_param = {{0}},
--- a/scripts/modules/rna_manual_reference.py
+++ b/scripts/modules/rna_manual_reference.py
@ -30,7 +30,6 @@ url_manual_mapping = (
    ("bpy.types.clothcollisionsettings.vertex_group_object_collisions*", "physics/cloth/settings/collisions.html#bpy-types-clothcollisionsettings-vertex-group-object-collisions"),
    ("bpy.types.gpencilsculptsettings.use_automasking_material_active*", "grease_pencil/modes/sculpting/introduction.html#bpy-types-gpencilsculptsettings-use-automasking-material-active"),
    ("bpy.types.gpencilsculptsettings.use_automasking_material_stroke*", "grease_pencil/modes/sculpting/introduction.html#bpy-types-gpencilsculptsettings-use-automasking-material-stroke"),
-    ("bpy.types.preferencessystem.max_shader_compilation_subprocesses*", "editors/preferences/system.html#bpy-types-preferencessystem-max-shader-compilation-subprocesses"),
    ("bpy.types.cycleslightsettings.use_multiple_importance_sampling*", "render/cycles/light_settings.html#bpy-types-cycleslightsettings-use-multiple-importance-sampling"),
    ("bpy.types.fluiddomainsettings.sndparticle_potential_max_energy*", "physics/fluid/type/domain/liquid/particles.html#bpy-types-fluiddomainsettings-sndparticle-potential-max-energy"),
    ("bpy.types.fluiddomainsettings.sndparticle_potential_min_energy*", "physics/fluid/type/domain/liquid/particles.html#bpy-types-fluiddomainsettings-sndparticle-potential-min-energy"),
@ -80,6 +79,7 @@ url_manual_mapping = (
    ("bpy.types.preferencesedit.grease_pencil_euclidean_distance*", "editors/preferences/editing.html#bpy-types-preferencesedit-grease-pencil-euclidean-distance"),
    ("bpy.types.preferencesedit.grease_pencil_manhattan_distance*", "editors/preferences/editing.html#bpy-types-preferencesedit-grease-pencil-manhattan-distance"),
    ("bpy.types.preferencesinput.mouse_emulate_3_button_modifier*", "editors/preferences/input.html#bpy-types-preferencesinput-mouse-emulate-3-button-modifier"),
+    ("bpy.types.preferencessystem.max_shader_compilation_workers*", "editors/preferences/system.html#bpy-types-preferencessystem-max-shader-compilation-workers"),
    ("bpy.types.brushgpencilsettings.use_stroke_random_strength*", "grease_pencil/modes/draw/brushes/draw.html#bpy-types-brushgpencilsettings-use-stroke-random-strength"),
    ("bpy.types.clothsettings.vertex_group_structural_stiffness*", "physics/cloth/settings/property_weights.html#bpy-types-clothsettings-vertex-group-structural-stiffness"),
    ("bpy.types.cyclesrendersettings.film_transparent_roughness*", "render/cycles/render_settings/film.html#bpy-types-cyclesrendersettings-film-transparent-roughness"),
--- a/scripts/startup/bl_ui/space_userpref.py
+++ b/scripts/startup/bl_ui/space_userpref.py
@ -809,8 +809,11 @@ class USERPREF_PT_system_memory(SystemPanel, CenterAlignMixIn, Panel):

        if sys.platform != "darwin":
            layout.separator()
-            col = layout.column()
-            col.prop(system, "max_shader_compilation_subprocesses")
+            col = layout.column(align=True)
+            col.active = system.gpu_backend != 'VULKAN'
+            col.row().prop(system, "shader_compilation_method", expand=True)
+            label = "Threads" if system.shader_compilation_method == 'THREAD' else "Subprocesses"
+            col.prop(system, "gpu_shader_workers", text=label)


 class USERPREF_PT_system_video_sequencer(SystemPanel, CenterAlignMixIn, Panel):
--- a/source/blender/blenkernel/BKE_blender_version.h
+++ b/source/blender/blenkernel/BKE_blender_version.h
@ -27,7 +27,7 @@

 /* Blender file format version. */
 #define BLENDER_FILE_VERSION BLENDER_VERSION
-#define BLENDER_FILE_SUBVERSION 85
+#define BLENDER_FILE_SUBVERSION 86

 /* Minimum Blender version that supports reading file written with the current
 * version. Older Blender versions will test this and cancel loading the file, showing a warning to
--- a/source/blender/blenloader/intern/versioning_userdef.cc
+++ b/source/blender/blenloader/intern/versioning_userdef.cc
@ -1496,6 +1496,12 @@ void blo_do_versions_userdef(UserDef *userdef)
    }
  }

+  if (!USER_VERSION_ATLEAST(405, 86)) {
+    if (userdef->gpu_shader_workers > 0) {
+      userdef->shader_compilation_method = USER_SHADER_COMPILE_SUBPROCESS;
+    }
+  }
+
  /**
   * Always bump subversion in BKE_blender_version.h when adding versioning
   * code here, and wrap it inside a USER_VERSION_ATLEAST check.
--- a/source/blender/draw/engines/eevee/eevee_instance.cc
+++ b/source/blender/draw/engines/eevee/eevee_instance.cc
@ -722,11 +722,14 @@ void Instance::draw_viewport()
    }
    if (materials.queued_shaders_count > 0) {
      info_append_i18n("Compiling shaders ({} remaining)", materials.queued_shaders_count);
-      if (!GPU_use_parallel_compilation() &&
-          GPU_type_matches_ex(GPU_DEVICE_ANY, GPU_OS_ANY, GPU_DRIVER_ANY, GPU_BACKEND_OPENGL))
+      if (GPU_backend_get_type() == GPU_BACKEND_OPENGL && !GPU_use_subprocess_compilation() &&
+          /* Only recommend subprocesses when there is known gain. */
+          (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_ANY) ||
+           GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY) ||
+           GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OFFICIAL)))
      {
        info_append_i18n(
-            "Increasing Preferences > System > Max Shader Compilation Subprocesses may improve "
+            "Setting Preferences > System > Shader Compilation Method to Subprocess might improve "
            "compilation time.");
      }
    }
--- a/source/blender/gpu/GPU_capabilities.hh
+++ b/source/blender/gpu/GPU_capabilities.hh
@ -43,7 +43,7 @@ const char *GPU_extension_get(int i);

 int GPU_texture_size_with_limit(int res);

-bool GPU_use_parallel_compilation();
+bool GPU_use_subprocess_compilation();
 int GPU_max_parallel_compilations();

 bool GPU_stencil_clasify_buffer_workaround();
--- a/source/blender/gpu/intern/gpu_capabilities.cc
+++ b/source/blender/gpu/intern/gpu_capabilities.cc
@ -131,9 +131,9 @@ int GPU_max_samplers()
  return GCaps.max_samplers;
 }

-bool GPU_use_parallel_compilation()
+bool GPU_use_subprocess_compilation()
 {
-  return GCaps.max_parallel_compilations > 0;
+  return GCaps.use_subprocess_shader_compilations;
 }

 int GPU_max_parallel_compilations()
--- a/source/blender/gpu/intern/gpu_capabilities_private.hh
+++ b/source/blender/gpu/intern/gpu_capabilities_private.hh
@ -63,6 +63,8 @@ struct GPUCapabilities {
  bool node_link_instancing_workaround = false;
  bool line_directive_workaround = false;

+  bool use_subprocess_shader_compilations = false;
+
  /* Vulkan related workarounds. */
  bool render_pass_workaround = false;

--- a/source/blender/gpu/intern/gpu_pass.cc
+++ b/source/blender/gpu/intern/gpu_pass.cc
@ -165,9 +165,8 @@ bool GPU_pass_should_optimize(GPUPass *pass)
  return (GPU_backend_get_type() == GPU_BACKEND_METAL) && pass->should_optimize;

 #if 0
-  /* Returns optimization heuristic prepared during initial codegen.
-   * NOTE: Optimization limited to parallel compilation as it causes CPU stalls otherwise. */
-  return pass->should_optimize && GPU_use_parallel_compilation();
+  /* Returns optimization heuristic prepared during initial codegen. */
+  return pass->should_optimize;
 #endif
 }

--- a/source/blender/gpu/intern/gpu_shader_create_info.cc
+++ b/source/blender/gpu/intern/gpu_shader_create_info.cc
@ -574,16 +574,8 @@ bool gpu_shader_create_info_compile(const char *name_starts_with_filter)
    }
  }

-  Vector<GPUShader *> result;
-  if (GPU_use_parallel_compilation() == false) {
-    for (const GPUShaderCreateInfo *info : infos) {
-      result.append(GPU_shader_create_from_info(info));
-    }
-  }
-  else {
-    BatchHandle batch = GPU_shader_batch_create_from_infos(infos);
-    result = GPU_shader_batch_finalize(batch);
-  }
+  BatchHandle batch = GPU_shader_batch_create_from_infos(infos);
+  Vector<GPUShader *> result = GPU_shader_batch_finalize(batch);

  for (int i : result.index_range()) {
    const ShaderCreateInfo *info = reinterpret_cast<const ShaderCreateInfo *>(infos[i]);
--- a/source/blender/gpu/metal/mtl_shader.mm
+++ b/source/blender/gpu/metal/mtl_shader.mm
@ -1551,7 +1551,6 @@ MTLComputePipelineStateInstance *MTLShader::bake_compute_pipeline_state(
 MTLShaderCompiler::MTLShaderCompiler()
    : ShaderCompiler(GPU_max_parallel_compilations(), GPUWorker::ContextType::PerThread, true)
 {
-  BLI_assert(GPU_use_parallel_compilation());
 }

 Shader *MTLShaderCompiler::compile_shader(const shader::ShaderCreateInfo &info)
--- a/source/blender/gpu/opengl/gl_backend.cc
+++ b/source/blender/gpu/opengl/gl_backend.cc
@ -723,17 +723,66 @@ void GLBackend::capabilities_init()
  detect_workarounds();

 #if BLI_SUBPROCESS_SUPPORT
-  if (GCaps.max_parallel_compilations == -1) {
-    GCaps.max_parallel_compilations = std::min(int(U.max_shader_compilation_subprocesses),
-                                               BLI_system_thread_count());
-  }
+  GCaps.use_subprocess_shader_compilations = U.shader_compilation_method ==
+                                             USER_SHADER_COMPILE_SUBPROCESS;
+#else
+  GCaps.use_subprocess_shader_compilations = false;
+#endif
  if (G.debug & G_DEBUG_GPU_RENDERDOC) {
    /* Avoid crashes on RenderDoc sessions. */
-    GCaps.max_parallel_compilations = 0;
+    GCaps.use_subprocess_shader_compilations = false;
  }
-#else
-  GCaps.max_parallel_compilations = 0;
-#endif
+
+  int thread_count = U.gpu_shader_workers;
+
+  if (thread_count == 0) {
+    /* Good default based on measurements. */
+
+    /* Always have at least 1 worker. */
+    thread_count = 1;
+
+    if (GCaps.use_subprocess_shader_compilations) {
+      /* Use reasonable number of worker by default when there are known gains. */
+      if (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_OFFICIAL) ||
+          GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OFFICIAL) ||
+          GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY))
+      {
+        /* Subprocess is too costly in memory (>150MB per worker) to have better defaults. */
+        thread_count = std::max(1, std::min(4, BLI_system_thread_count() / 2));
+      }
+    }
+    else if (GPU_type_matches(GPU_DEVICE_NVIDIA, GPU_OS_ANY, GPU_DRIVER_OFFICIAL)) {
+      /* Best middle ground between memory usage and speedup as Nvidia context memory footprint
+       * is quite heavy (~25MB). Moreover we have diminishing return after this because of PSO
+       * compilation blocking the main thread.
+       * Can be revisited if we find a way to delete the worker thread context after finishing
+       * compilation, and fix the scheduling bubbles (#139775). */
+      thread_count = 4;
+    }
+    else if (GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OPENSOURCE) ||
+             GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_UNIX, GPU_DRIVER_ANY))
+    {
+      /* Mesa has very good compilation time and doesn't block the main thread.
+       * The memory footprint of the worker context is rather small (<10MB).
+       * Shader compilation gets much slower as the number of threads increases. */
+      thread_count = 8;
+    }
+    else if (GPU_type_matches(GPU_DEVICE_ATI, GPU_OS_ANY, GPU_DRIVER_OFFICIAL)) {
+      /* AMD proprietary driver's context have huge memory footprint (~45MB).
+       * There is also not much gain from parallelization. */
+      thread_count = 1;
+    }
+    else if (GPU_type_matches(GPU_DEVICE_INTEL, GPU_OS_WIN, GPU_DRIVER_ANY)) {
+      /* Intel windows driver offer almost no speedup with parallel compilation. */
+      thread_count = 1;
+    }
+  }
+
+  /* Allow thread count override option to limit the number of workers and avoid allocating more
+   * workers than needed. Also ensures that there is always 1 thread available for the UI. */
+  int max_thread_count = std::max(1, BLI_system_thread_count() - 1);
+
+  GCaps.max_parallel_compilations = std::min(thread_count, max_thread_count);

  /* Disable this feature entirely when not debugging. */
  if ((G.debug & G_DEBUG_GPU) == 0) {
--- a/source/blender/gpu/opengl/gl_backend.hh
+++ b/source/blender/gpu/opengl/gl_backend.hh
@ -9,10 +9,15 @@
 #pragma once

 #include "GPU_capabilities.hh"
+#include "GPU_platform.hh"
+
 #include "gpu_backend.hh"

+#include "BLI_threads.h"
 #include "BLI_vector.hh"

+#include "gpu_capabilities_private.hh"
+
 #ifdef WITH_RENDERDOC
 #  include "renderdoc_api.hh"
 #endif
@ -56,7 +61,7 @@ class GLBackend : public GPUBackend {

  void init_resources() override
  {
-    if (GPU_use_parallel_compilation()) {
+    if (GCaps.use_subprocess_shader_compilations) {
      compiler_ = MEM_new<GLSubprocessShaderCompiler>(__func__);
    }
    else {
--- a/source/blender/gpu/opengl/gl_shader.hh
+++ b/source/blender/gpu/opengl/gl_shader.hh
@ -198,8 +198,8 @@ class GLShader : public Shader {

 class GLShaderCompiler : public ShaderCompiler {
 public:
-  GLShaderCompiler(uint32_t threads_count = 1)
-      : ShaderCompiler(threads_count, GPUWorker::ContextType::PerThread, true){};
+  GLShaderCompiler()
+      : ShaderCompiler(GPU_max_parallel_compilations(), GPUWorker::ContextType::PerThread, true){};

  virtual void specialize_shader(ShaderSpecialization &specialization) override;
 };
--- a/source/blender/makesdna/DNA_userdef_types.h
+++ b/source/blender/makesdna/DNA_userdef_types.h
@ -487,13 +487,17 @@ typedef struct UserDef {
  int gpu_preferred_index;
  uint32_t gpu_preferred_vendor_id;
  uint32_t gpu_preferred_device_id;
-  char _pad16[4];
+
+  /** Max number of parallel shader compilation workers. */
+  short gpu_shader_workers;
+  /** eUserpref_ShaderCompileMethod (OpenGL only). */
+  short shader_compilation_method;
+
+  char _pad16[2];
+
  /** #eGPUBackendType */
  short gpu_backend;

-  /** Max number of parallel shader compilation subprocesses. */
-  short max_shader_compilation_subprocesses;
-
  /** Number of samples for FPS display calculations. */
  short playback_fps_samples;

@ -1125,6 +1129,11 @@ typedef enum eUserpref_SeqEditorFlags {
  USER_SEQ_ED_CONNECT_STRIPS_BY_DEFAULT = (1 << 1),
 } eUserpref_SeqEditorFlags;

+typedef enum eUserpref_ShaderCompileMethod {
+  USER_SHADER_COMPILE_THREAD = 0,
+  USER_SHADER_COMPILE_SUBPROCESS = 1,
+} eUserpref_ShaderCompileMethod;
+
 /* Locale Ids. Auto will try to get local from OS. Our default is English though. */
 /** #UserDef.language */
 enum {
--- a/source/blender/makesdna/intern/dna_rename_defs.h
+++ b/source/blender/makesdna/intern/dna_rename_defs.h
@ -215,6 +215,7 @@ DNA_STRUCT_RENAME_MEMBER(UVProjectModifierData, num_projectors, projectors_num)
 DNA_STRUCT_RENAME_MEMBER(UserDef, autokey_flag, keying_flag)
 DNA_STRUCT_RENAME_MEMBER(UserDef, gp_manhattendist, gp_manhattandist)
 DNA_STRUCT_RENAME_MEMBER(UserDef, pythondir, pythondir_legacy)
+DNA_STRUCT_RENAME_MEMBER(UserDef, max_shader_compilation_subprocesses, gpu_shader_workers)
 DNA_STRUCT_RENAME_MEMBER(VFont, name, filepath)
 DNA_STRUCT_RENAME_MEMBER(View3D, far, clip_end)
 DNA_STRUCT_RENAME_MEMBER(View3D, local_collections_uuid, local_collections_uid)
--- a/source/blender/makesrna/intern/rna_userdef.cc
+++ b/source/blender/makesrna/intern/rna_userdef.cc
@ -6349,14 +6349,36 @@ static void rna_def_userdef_system(BlenderRNA *brna)
                           "Preferred device to select during detection (requires restarting "
                           "Blender for changes to take effect)");

-  prop = RNA_def_property(srna, "max_shader_compilation_subprocesses", PROP_INT, PROP_NONE);
-  RNA_def_property_range(prop, 0, INT16_MAX);
+  prop = RNA_def_property(srna, "gpu_shader_workers", PROP_INT, PROP_NONE);
+  RNA_def_property_range(prop, 0, 32);
  RNA_def_property_ui_text(prop,
-                           "Max Shader Compilation Subprocesses",
-                           "Max number of parallel shader compilation subprocesses, "
+                           "Shader Compilation Workers",
+                           "Number of shader compilation threads or subprocesses, "
                           "clamped at the max threads supported by the CPU "
                           "(requires restarting Blender for changes to take effect). "
-                           "Setting it to 0 disables subprocess shader compilation.");
+                           "A higher number increases the RAM usage while reducing "
+                           "compilation time. A value of 0 will use automatic configuration. "
+                           "(OpenGL only)");
+
+  static const EnumPropertyItem shader_compilation_method_items[] = {
+      {USER_SHADER_COMPILE_THREAD, "THREAD", 0, "Thread", "Use threads for compiling shaders"},
+      {USER_SHADER_COMPILE_SUBPROCESS,
+       "SUBPROCESS",
+       0,
+       "Subprocess",
+       "Use subprocesses for compiling shaders"},
+      {0, nullptr, 0, nullptr, nullptr},
+  };
+
+  prop = RNA_def_property(srna, "shader_compilation_method", PROP_ENUM, PROP_NONE);
+  RNA_def_property_enum_items(prop, shader_compilation_method_items);
+  RNA_def_property_ui_text(prop,
+                           "Shader Compilation Method",
+                           "Compilation method used for compiling shaders in parallel. "
+                           "Subprocess requires a lot more RAM for each worker "
+                           "but might compile shaders faster on some systems. "
+                           "Requires restarting Blender for changes to take effect. "
+                           "(OpenGL only)");

  /* Network. */