Add some encoding debugging to make testing easier

kddnewton · kddnewton · commit 0c042561c681 · 2024-02-23T14:48:01.000-05:00
diff --git a/ext/prism/extension.c b/ext/prism/extension.c
@@ -21,6 +21,8 @@ VALUE rb_cPrismParseError;
 VALUE rb_cPrismParseWarning;
 VALUE rb_cPrismParseResult;
 
+VALUE rb_cPrismDebugEncoding;
+
 ID rb_option_id_filepath;
 ID rb_option_id_encoding;
 ID rb_option_id_line;
@@ -1102,6 +1104,80 @@ format_errors(VALUE self, VALUE source, VALUE colorize) {
     return result;
 }
 
+/**
+ * call-seq: Debug::Encoding.all -> Array[Debug::Encoding]
+ *
+ * Build one Debug::Encoding object per entry in prism's encoding table.
+ */
+static VALUE
+encoding_all(VALUE self) {
+    VALUE result = rb_ary_new();
+
+    for (size_t i = 0; i < PM_ENCODING_MAXIMUM; i++) {
+        const pm_encoding_t *entry = pm_encodings + i;
+        VALUE argv[2] = { rb_str_new_cstr(entry->name), entry->multibyte ? Qtrue : Qfalse };
+
+        rb_ary_push(result, rb_class_new_instance(2, argv, rb_cPrismDebugEncoding));
+    }
+
+    return result;
+}
+
+// Looks up a prism encoding by name; raises TypeError or ArgumentError on failure.
+static const pm_encoding_t *
+encoding_find(VALUE name) {
+    Check_Type(name, T_STRING); // RSTRING_PTR on a non-String is undefined
+    const uint8_t *source = (const uint8_t *) RSTRING_PTR(name);
+    const pm_encoding_t *encoding = pm_encoding_find(source, source + RSTRING_LEN(name));
+    // The bytes are uint8_t and may contain NULs, so format the String object itself.
+    if (encoding == NULL) { rb_raise(rb_eArgError, "Unknown encoding: %" PRIsVALUE, name); }
+    return encoding;
+}
+
+/**
+ * call-seq: Debug::Encoding._width(name, source) -> Integer
+ *
+ * Returns the byte width of the first character of source, or 0 if invalid.
+ */
+static VALUE
+encoding_char_width(VALUE self, VALUE name, VALUE value) {
+    Check_Type(value, T_STRING); // raise TypeError rather than read a non-String
+    return ULONG2NUM(encoding_find(name)->char_width((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)));
+}
+
+/**
+ * call-seq: Debug::Encoding._alnum?(name, source) -> true | false
+ *
+ * Returns true if source starts with an alphanumeric character in the encoding.
+ */
+static VALUE
+encoding_alnum_char(VALUE self, VALUE name, VALUE value) {
+    Check_Type(value, T_STRING); // raise TypeError rather than read a non-String
+    return encoding_find(name)->alnum_char((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)) > 0 ? Qtrue : Qfalse;
+}
+
+/**
+ * call-seq: Debug::Encoding._alpha?(name, source) -> true | false
+ *
+ * Returns true if source starts with an alphabetic character in the encoding.
+ */
+static VALUE
+encoding_alpha_char(VALUE self, VALUE name, VALUE value) {
+    Check_Type(value, T_STRING); // raise TypeError rather than read a non-String
+    return encoding_find(name)->alpha_char((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)) > 0 ? Qtrue : Qfalse;
+}
+
+/**
+ * call-seq: Debug::Encoding._upper?(name, source) -> true | false
+ *
+ * Returns true if source starts with an uppercase character in the encoding.
+ */
+static VALUE
+encoding_isupper_char(VALUE self, VALUE name, VALUE value) {
+    Check_Type(value, T_STRING); // raise TypeError rather than read a non-String
+    return encoding_find(name)->isupper_char((const uint8_t *) RSTRING_PTR(value), RSTRING_LEN(value)) ? Qtrue : Qfalse;
+}
+
 /******************************************************************************/
 /* Initialization of the extension                                            */
 /******************************************************************************/
@@ -1182,6 +1258,15 @@ Init_prism(void) {
     rb_define_singleton_method(rb_cPrismDebug, "inspect_node", inspect_node, 1);
     rb_define_singleton_method(rb_cPrismDebug, "format_errors", format_errors, 2);
 
+    // Next, define the functions that are exposed through the private
+    // Debug::Encoding class.
+    rb_cPrismDebugEncoding = rb_define_class_under(rb_cPrismDebug, "Encoding", rb_cObject);
+    rb_define_singleton_method(rb_cPrismDebugEncoding, "all", encoding_all, 0);
+    rb_define_singleton_method(rb_cPrismDebugEncoding, "_width", encoding_char_width, 2);
+    rb_define_singleton_method(rb_cPrismDebugEncoding, "_alnum?", encoding_alnum_char, 2);
+    rb_define_singleton_method(rb_cPrismDebugEncoding, "_alpha?", encoding_alpha_char, 2);
+    rb_define_singleton_method(rb_cPrismDebugEncoding, "_upper?", encoding_isupper_char, 2);
+
     // Next, initialize the other APIs.
     Init_prism_api_node();
     Init_prism_pack();
diff --git a/lib/prism/debug.rb b/lib/prism/debug.rb
@@ -202,5 +202,48 @@ def self.prism_locals(source)
     def self.newlines(source)
       Prism.parse(source).source.offsets
     end
+
+    # A wrapper around prism's internal encoding data structures. This is used
+    # for reflection and debugging purposes.
+    class Encoding
+      # The name of the encoding, which can be passed to Encoding.find.
+      attr_reader :name
+
+      # Initialize a new encoding with the given name and a flag indicating
+      # whether or not it is a multibyte encoding.
+      def initialize(name, multibyte)
+        @name = name
+        @multibyte = multibyte
+      end
+
+      # Whether or not the encoding is a multibyte encoding.
+      def multibyte?
+        @multibyte
+      end
+
+      # Returns the number of bytes in the first character of the source string
+      # if it is valid for the encoding, otherwise 0. Delegates to Encoding._width.
+      def width(source)
+        Encoding._width(name, source)
+      end
+
+      # Returns true if the first character in the source string is a valid
+      # alphanumeric character for the encoding. Delegates to Encoding._alnum?.
+      def alnum?(source)
+        Encoding._alnum?(name, source)
+      end
+
+      # Returns true if the first character in the source string is a valid
+      # alphabetic character for the encoding. Delegates to Encoding._alpha?.
+      def alpha?(source)
+        Encoding._alpha?(name, source)
+      end
+
+      # Returns true if the first character in the source string is a valid
+      # uppercase character for the encoding. Delegates to Encoding._upper?.
+      def upper?(source)
+        Encoding._upper?(name, source)
+      end
+    end
   end
 end