Support Marshal.{dump,load} for core Set #13185

jeremyevans · 2025-04-27T02:53:31Z

This was missed when adding core Set, because it's handled implicitly for T_OBJECT.

I attempted to allow core Set to support Marshal.load with stdlib Set marshal output, but it doesn't look like Marshal supports this, raising dump format error (ArgumentError). If the object was previously relying on the default Marshal.dump implementation for T_OBJECT, it doesn't look like you can restore it to a non-T_OBJECT, regardless of how that object implements _load/marshal_load. I think it would require changes to Marshal to allow that.

This was missed when adding core Set, because it's handled implicitly for T_OBJECT. I attempted to allow core Set to support Marshal.load with stdlib Set marshal output, but it doesn't look like Marshal supports this, raising `dump format error (ArgumentError)`. If the object was previously relying on the default Marshal.dump implementation for T_OBJECT, it doesn't look like you can restore it to a non-T_OBJECT, regardless of how that object implements _load/marshal_load. I think it would require changes to Marshal to allow that.

byroot · 2025-04-27T05:49:34Z

Can't we use rb_marshal_define_compat to preserve the format? because otherwise I fear some apps may get broken.

It's generally understood that it's preferable not to exchange Marshal across Ruby versions, but this isn't always respected.

jeremyevans · 2025-04-27T05:55:43Z

Can't we use rb_marshal_define_compat to preserve the format? because otherwise I fear some apps may get broken.

Thanks for pointing this out, I was not aware of it. I'm guessing it will work, I'll try implementing support using it.

It's generally understood that it's preferable not to exchange Marshal across Ruby versions, but this isn't always respected.

Agreed, that's why I had a goal to support it.

nobu · 2025-04-27T08:11:54Z

If you consider the backward compatibility, use _dump and _load instead of marshal_dump and marshal_load.

$ ./ruby -e 'Marshal.dump(Set[1,2,3], STDOUT)' | ruby3.4 -e 'Marshal.load(STDIN)'
<internal:marshal>:34:in 'Marshal.load': instance of Set needs to have method 'marshal_load' (TypeError)
	from -e:1:in '<main>'

This is my patch from 8a0b358.

diff --git i/hash.c w/hash.c
index 98f39c86324..2a2d5c31a45 100644
--- i/hash.c
+++ w/hash.c
@@ -2253,7 +2253,7 @@ rb_hash_default(int argc, VALUE *argv, VALUE hash)
  *  See {Hash Default}[rdoc-ref:Hash@Hash+Default].
  */
 
-static VALUE
+VALUE
 rb_hash_set_default(VALUE hash, VALUE ifnone)
 {
     rb_hash_modify_check(hash);
diff --git i/internal/hash.h w/internal/hash.h
index 676f1404960..03cd830506a 100644
--- i/internal/hash.h
+++ w/internal/hash.h
@@ -72,6 +72,7 @@ struct RHash {
 /* hash.c */
 void rb_hash_st_table_set(VALUE hash, st_table *st);
 VALUE rb_hash_default_value(VALUE hash, VALUE key);
+VALUE rb_hash_set_default(VALUE hash, VALUE ifnone);
 VALUE rb_hash_set_default_proc(VALUE hash, VALUE proc);
 long rb_dbl_long_hash(double d);
 st_table *rb_init_identtable(void);
diff --git i/set.c w/set.c
index 6f11f12e63e..d69b43078fd 100644
--- i/set.c
+++ w/set.c
@@ -99,6 +99,7 @@ VALUE rb_cSet;
 static ID id_each_entry;
 static ID id_any_p;
 static ID id_new;
+static ID id_i_hash;
 static ID id_set_iter_lev;
 
 #define RSET_INITIALIZED FL_USER1
@@ -1851,59 +1852,64 @@ set_i_hash(VALUE set)
 }
 
 /* :nodoc: */
-static VALUE
-set_i_marshal_dump(VALUE set)
+static int
+set_to_hash_i(st_data_t key, st_data_t arg)
 {
-    VALUE array = rb_ary_new_from_args(2, set_i_to_a(set), set_i_compare_by_identity_p(set));
+    rb_hash_aset((VALUE)arg, (VALUE)key, Qtrue);
+    return ST_CONTINUE;
+}
 
-    if (FL_TEST(set, FL_EXIVAR)) {
-	rb_copy_generic_ivar(array, set);
-	FL_SET(array, FL_EXIVAR);
+static VALUE
+set_i_to_h(VALUE set)
+{
+    st_index_t size = RSET_SIZE(set);
+    VALUE hash;
+    if (RSET_COMPARE_BY_IDENTITY(set)) {
+        hash = rb_ident_hash_new_with_size(size);
+    }
+    else {
+        hash = rb_hash_new_with_size(size);
     }
+    rb_hash_set_default(hash, Qfalse);
 
-    return array;
+    if (size == 0) return hash;
+
+    set_iter(set, set_to_hash_i, (st_data_t)hash);
+    return hash;
 }
 
-/* :nodoc: */
 static VALUE
-set_i_marshal_load(VALUE set, VALUE array)
+compat_dumper(VALUE set)
 {
-    rb_check_frozen(set);
-    if (RBASIC(set)->flags & RSET_INITIALIZED) {
-        rb_raise(rb_eArgError, "set is already initialized");
-    }
-    if (!RB_TYPE_P(array, T_ARRAY)) {
-        rb_raise(rb_eArgError, "argument must be array");
-    }
-    if (RARRAY_LEN(array) != 2) {
-        rb_raise(rb_eArgError, "argument must have 2 elements");
-    }
-    VALUE compare_by_identity = RARRAY_AREF(array, 1);
-    array = RARRAY_AREF(array, 0);
-    if (!RB_TYPE_P(array, T_ARRAY)) {
-        rb_raise(rb_eArgError, "first element in argument must be array");
-    }
-    if (compare_by_identity != Qtrue && compare_by_identity != Qfalse) {
-        rb_raise(rb_eArgError, "second element in argument must be true or false");
+    VALUE dumper = rb_class_new_instance(0, 0, rb_cObject);
+    rb_ivar_set(dumper, id_i_hash, set_i_to_h(set));
+    return dumper;
+}
+
+static int
+set_i_from_hash_i(st_data_t key, st_data_t val, st_data_t set)
+{
+    if ((VALUE)val != Qtrue) {
+        rb_raise(rb_eRuntimeError, "expect true as Set value: %"PRIsVALUE, rb_obj_class((VALUE)val));
     }
-    RBASIC(set)->flags |= RSET_INITIALIZED;
-    if (RTEST(compare_by_identity)) set_i_compare_by_identity(set);
-    set_merge_enum_into(set, array);
+    set_i_add((VALUE)set, (VALUE)key);
+    return ST_CONTINUE;
+}
+
+static VALUE
+set_i_from_hash(VALUE set, VALUE hash)
+{
+    Check_Type(hash, T_HASH);
+    if (rb_hash_compare_by_id_p(hash)) set_i_compare_by_identity(set);
+    rb_hash_stlike_foreach(hash, set_i_from_hash_i, (st_data_t)set);
     return set;
 }
 
-/* :nodoc: */
-/*
- * This is to attempt to load marshalled values from stdlib set, but it
- * doesn't work (Marshal does not attempt to call it):
- *
- *     'Marshal.load': dump format error (ArgumentError)
- *
 static VALUE
-set_s__load(VALUE klass, VALUE str)
+compat_loader(VALUE self, VALUE a)
 {
+    return set_i_from_hash(self, rb_ivar_get(a, id_i_hash));
 }
-*/
 
 /*
  *  Document-class: Set
@@ -2123,11 +2129,11 @@ Init_Set(void)
     id_each_entry = rb_intern_const("each_entry");
     id_any_p = rb_intern_const("any?");
     id_new = rb_intern_const("new");
+    id_i_hash = rb_intern_const("@hash");
     id_set_iter_lev = rb_make_internal_id();
 
     rb_define_alloc_func(rb_cSet, set_s_alloc);
     rb_define_singleton_method(rb_cSet, "[]", set_s_create, -1);
-    // rb_define_singleton_method(rb_cSet, "_load", set_s__load, 1);
 
     rb_define_method(rb_cSet, "initialize", set_i_initialize, -1);
     rb_define_method(rb_cSet, "initialize_copy", set_i_initialize_copy, 1);
@@ -2170,8 +2176,6 @@ Init_Set(void)
     rb_define_method(rb_cSet, "intersect?", set_i_intersect, 1);
     rb_define_method(rb_cSet, "join", set_i_join, -1);
     rb_define_method(rb_cSet, "keep_if", set_i_keep_if, 0);
-    rb_define_method(rb_cSet, "marshal_dump", set_i_marshal_dump, 0);
-    rb_define_method(rb_cSet, "marshal_load", set_i_marshal_load, 1);
     rb_define_method(rb_cSet, "merge", set_i_merge, -1);
     rb_define_method(rb_cSet, "proper_subset?", set_i_proper_subset, 1);
     rb_define_alias(rb_cSet, "<", "proper_subset?");
@@ -2190,7 +2194,13 @@ Init_Set(void)
     rb_define_method(rb_cSet, "superset?", set_i_superset, 1);
     rb_define_alias(rb_cSet, ">=", "superset?");
     rb_define_method(rb_cSet, "to_a", set_i_to_a, 0);
+    rb_define_method(rb_cSet, "to_h", set_i_to_h, 0);
     rb_define_method(rb_cSet, "to_set", set_i_to_set, -1);
+    // rb_define_singleton_method(rb_cSet, "from_hash", compat_loader, 1);
+
+    /* :nodoc: */
+    VALUE compat = rb_define_class_under(rb_cSet, "compatible", rb_cObject);
+    rb_marshal_define_compat(rb_cSet, compat, compat_dumper, compat_loader);
 
     rb_provide("set.rb");
 }
diff --git i/test/ruby/test_set.rb w/test/ruby/test_set.rb
index f2b5fa16226..8446b14a1af 100644
--- i/test/ruby/test_set.rb
+++ w/test/ruby/test_set.rb
@@ -23,10 +23,8 @@
     assert_equal(set.compare_by_identity?, mset.compare_by_identity?)
     assert_equal(1, mset.instance_variable_get(:@a))
 
-    # Not sure if possible to support?
-    # 'Marshal.load': dump format error (ArgumentError)
-    # old_stdlib_set_data = "\x04\bo:\bSet\x06:\n@hash}\bi\x06Ti\aTi\bTF".b
-    # assert_equal(Set[1, 2, 3], Marshal.load(old_stdlib_set_data))
+    old_stdlib_set_data = "\x04\bo:\bSet\x06:\n@hash}\bi\x06Ti\aTi\bTF".b
+    assert_equal(Set[1, 2, 3], Marshal.load(old_stdlib_set_data))
   end
 
   def test_aref

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>

jeremyevans · 2025-04-27T09:06:12Z

@nobu Thank you. I don't have a problem with making it backwards compatible both ways. Doing so bloats the marshal string for larger sets, but I guess that isn't a major problem. Your approach also seems more elegant than mine. I've replaced my previous commit with yours, but kept the extra specs I added.

byroot · 2025-04-27T09:08:28Z

but I guess that isn't a major problem.

Compatibility definitely trumps a slightly smaller payload size.

byroot · 2025-04-27T09:50:00Z

set.c

    rb_define_method(rb_cSet, "to_set", set_i_to_set, -1);
+    // rb_define_singleton_method(rb_cSet, "from_hash", compat_loader, 1);


Is there a purpose to this commented out method?

I assume that leaving it commented out makes it easier to test manually. However, it seems like it should be a regular method and not a singleton method, as compat_loader calls set_i_to_hash, which accepts a Set instance and not a class. @nobu is that correct?

Yes, it was for just debugging but a mistake.
Please remove it.

Pushed a commit to remove it. I'll be squashing these commits when merging.

byroot · 2025-04-30T07:01:08Z

I just noticed thanks to one of my gems test suite that we have a similar problem with Psych:

   1) Failure:
SafeYAMLTest#test_#dump_accepts_permitted_classes [test/vanilla/safe_yaml_test.rb:49]:
Expected: #<Set: {1, 2}>
  Actual: #<Set: {}>

  2) Failure:
SafeYAMLTest#test_#load_accepts_permitted_classes [test/vanilla/safe_yaml_test.rb:21]:
Expected: #<Set: {1, 2}>
  Actual: #<Set: {}>

jeremyevans requested a review from byroot April 27, 2025 02:53

jeremyevans mentioned this pull request Apr 27, 2025

Implement Set as a core class #13074

Merged

Support restoring marshalled data for stdlib set

b41f62f

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>

jeremyevans force-pushed the set-marshal branch from 2e61054 to b41f62f Compare April 27, 2025 09:05

This comment has been minimized.

Sign in to view

byroot reviewed Apr 27, 2025

View reviewed changes

Remove commented out method

4d085f1

byroot approved these changes Apr 28, 2025

View reviewed changes

jeremyevans merged commit 9264111 into ruby:master Apr 28, 2025
81 of 85 checks passed

MSP-Greg mentioned this pull request Apr 28, 2025

ci: Add Ubuntu and Windows arm jobs puma/puma#3651

Merged

7 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Support Marshal.{dump,load} for core Set #13185

Support Marshal.{dump,load} for core Set #13185

Uh oh!

jeremyevans commented Apr 27, 2025

Uh oh!

byroot commented Apr 27, 2025

Uh oh!

jeremyevans commented Apr 27, 2025

Uh oh!

nobu commented Apr 27, 2025

Uh oh!

jeremyevans commented Apr 27, 2025

Uh oh!

byroot commented Apr 27, 2025

Uh oh!

This comment has been minimized.

byroot Apr 27, 2025

Uh oh!

jeremyevans Apr 27, 2025

Uh oh!

nobu Apr 28, 2025

Uh oh!

jeremyevans Apr 28, 2025

Uh oh!

Uh oh!

byroot commented Apr 30, 2025

Uh oh!

Uh oh!

		rb_define_method(rb_cSet, "to_set", set_i_to_set, -1);
		// rb_define_singleton_method(rb_cSet, "from_hash", compat_loader, 1);

Support Marshal.{dump,load} for core Set #13185

Support Marshal.{dump,load} for core Set #13185

Uh oh!

Conversation

jeremyevans commented Apr 27, 2025

Uh oh!

byroot commented Apr 27, 2025

Uh oh!

jeremyevans commented Apr 27, 2025

Uh oh!

nobu commented Apr 27, 2025

Uh oh!

jeremyevans commented Apr 27, 2025

Uh oh!

byroot commented Apr 27, 2025

Uh oh!

This comment has been minimized.

byroot Apr 27, 2025

Choose a reason for hiding this comment

Uh oh!

jeremyevans Apr 27, 2025

Choose a reason for hiding this comment

Uh oh!

nobu Apr 28, 2025

Choose a reason for hiding this comment

Uh oh!

jeremyevans Apr 28, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

byroot commented Apr 30, 2025

Uh oh!

Uh oh!