=encoding ISO8859-1 =cut package Regexp::Grammars; use re 'eval'; use warnings; use strict; use 5.010; use vars (); use Scalar::Util qw< blessed reftype >; use Data::Dumper qw< Dumper >; our $VERSION = '1.058'; my $anon_scalar_ref = \do{my $var}; my $MAGIC_VARS = q{my ($CAPTURE, $CONTEXT, $DEBUG, $INDEX, $MATCH, %ARG, %MATCH);}; my $PROBLEM_WITH_5_18 = <<'END_ERROR_MSG'; Warning: Regexp::Grammars is unsupported under Perl 5.18.0 through 5.18.3 due to a bug in regex parsing under those versions. Please upgrade to Perl 5.18.4 or later, or revert to Perl 5.16 or earlier. END_ERROR_MSG # Load the module... sub import { # Signal lexical scoping (active, unless something was exported)... $^H{'Regexp::Grammars::active'} = 1; # Process any regexes in module's active lexical scope... use overload; overload::constant( qr => sub { my ($raw, $cooked, $type) = @_; # In active scope and really a regex... if (_module_is_active() && $type =~ /qq?/) { return bless \$cooked, 'Regexp::Grammars::Precursor'; } # Ignore everything else... else { return $cooked; } } ); # Deal with 5.18 issues... if ($] >= 5.018) { # Issue warning... if ($] < 5.018004) { require Carp; Carp::croak($PROBLEM_WITH_5_18); } # Deal with cases where Perl 5.18+ complains about # the injection of (??{...}) and (?{...}) require re; re->import('eval'); # Sanctify the standard Regexp::Grammars pseudo-variables from # Perl 5.18's early enforcement of strictures... my $caller = caller; warnings->unimport('once'); @_ = ( 'vars', '$CAPTURE', '$CONTEXT', '$DEBUG', '$INDEX', '$MATCH', '%ARG', '%MATCH' ); goto &vars::import; } } # Deactivate module's regex effect when it is "anti-imported" with 'no'... sub unimport { # Signal lexical (non-)scoping... $^H{'Regexp::Grammars::active'} = 0; require re; re->unimport('eval'); } # Encapsulate the hoopy user-defined pragma interface... sub _module_is_active { return (caller 1)[10]->{'Regexp::Grammars::active'}; } my $RULE_HANDLER; sub clear_rule_handler { undef $RULE_HANDLER; } sub Regexp::with_actions { my ($self, $handler) = @_; $RULE_HANDLER = $handler; return $self; } #=====[ COMPILE-TIME INTERIM REPRESENTATION OF GRAMMARS ]=================== { package Regexp::Grammars::Precursor; # Only translate precursors once... state %grammar_cache; use overload ( # Concatenation/interpolation just concatenates to the precursor... q{.} => sub { my ($x, $y, $reversed) = @_; if (ref $x) { $x = ${$x} } if (ref $y) { $y = ${$y} } if ($reversed) { ($y,$x) = ($x,$y); } $x .= $y//q{}; return bless \$x, 'Regexp::Grammars::Precursor'; }, # Using as a string (i.e. matching) preprocesses the precursor... q{""} => sub { my ($obj) = @_; return $grammar_cache{ overload::StrVal($$obj) } //= Regexp::Grammars::_build_grammar( ${$obj} ); }, # Everything else, as usual... fallback => 1, ); } #=====[ SUPPORT FOR THE INTEGRATED DEBUGGER ]========================= # All messages go to STDERR by default... *Regexp::Grammars::LOGFILE = *STDERR{IO}; # Debugging levels indicate where to stop... our %DEBUG_LEVEL = ( same => undef, # No change in debugging mode off => 0, # No more debugging run => 1, continue => 1, # Run to completion of regex match match => 2, on => 2, # Run to next successful submatch step => 3, try => 3, # Run to next reportable event ); # Debugging levels can be abbreviated to one character during interactions... @DEBUG_LEVEL{ map {substr($_,0,1)} keys %DEBUG_LEVEL } = values %DEBUG_LEVEL; $DEBUG_LEVEL{o} = $DEBUG_LEVEL{off}; # Not "on" $DEBUG_LEVEL{s} = $DEBUG_LEVEL{step}; # Not "same" # Width of leading context field in debugging messages is constrained... my $MAX_CONTEXT_WIDTH = 20; my $MIN_CONTEXT_WIDTH = 6; sub set_context_width { { package Regexp::Grammars::ContextRestorer; sub new { my ($class, $old_context_width) = @_; bless \$old_context_width, $class; } sub DESTROY { my ($old_context_width_ref) = @_; $MAX_CONTEXT_WIDTH = ${$old_context_width_ref}; } } my ($new_context_width) = @_; my $old_context_width = $MAX_CONTEXT_WIDTH; $MAX_CONTEXT_WIDTH = $new_context_width; if (defined wantarray) { return Regexp::Grammars::ContextRestorer->new($old_context_width); } } # Rewrite a string currently being matched, to make \n and \t visible sub _show_metas { my $context_str = shift // q{}; # Quote newlines (\n -> \\n, without using a regex)... my $index = index($context_str,"\n"); while ($index >= 0) { substr($context_str, $index, 1, '\\n'); $index = index($context_str,"\n",$index+2); } # Quote tabs (\t -> \\t, without using a regex)... $index = index($context_str,"\t"); while ($index >= 0) { substr($context_str, $index, 1, '\\t'); $index = index($context_str,"\t",$index+2); } return $context_str; } # Minimize whitespace in a string... sub _squeeze_ws { my ($str) = @_; $str =~ tr/\n\t/ /; my $index = index($str,q{ }); while ($index >= 0) { substr($str, $index, 2, q{ }); $index = index($str,q{ },$index); } return $str; } # Prepare for debugging... sub _init_try_stack { our (@try_stack, $last_try_pos, $last_context_str); # Start with a representation of the entire grammar match... @try_stack = ({ subrule => '', height => 0, errmsg => ' \\FAIL ', }); # Initialize tracking of location and context... $last_try_pos = -1; $last_context_str = q{}; # Report... say {*Regexp::Grammars::LOGFILE} _debug_context('=>') . 'Trying from position ' . pos(); } # Create a "context string" showing where the regex is currently matching... sub _debug_context { my ($fill_chars) = @_; # Determine minimal sufficient width for context field... my $field_width = length(_show_metas($_//q{})); if ($field_width > $MAX_CONTEXT_WIDTH) { $field_width = $MAX_CONTEXT_WIDTH; } elsif ($field_width < $MIN_CONTEXT_WIDTH) { $field_width = $MIN_CONTEXT_WIDTH; } # Get current matching position (and some additional trailing context)... my $context_str = substr(_show_metas(substr(($_//q{}).q{},pos()//0,$field_width)),0,$field_width); # Build the context string, handling special cases... our $last_context_str //= q{}; if ($fill_chars) { # If caller supplied a 1- or 2-char fill sequence, use that instead... my $last_fill_char = length($fill_chars) > 1 ? substr($fill_chars,-1,1,q{}) : $fill_chars ; $context_str = $fill_chars x ($field_width-1) . $last_fill_char; } else { # Make end-of-string visible in empty context string... if ($context_str eq q{}) { $context_str = '[eos]'; } # Don't repeat consecutive identical context strings... if ($context_str eq $last_context_str) { $context_str = q{ } x $field_width; } else { # If not repeating, remember for next time... $last_context_str = $context_str; } } # Left justify and return context string... return sprintf("%-*s ",$field_width,$context_str); } # Show a debugging message (mainly used for compile-time errors and info)... sub _debug_notify { # Single arg is a line to be printed with a null severity... my ($severity, @lines) = @_==1 ? (q{},@_) : @_; chomp @lines; # Formatting string for all lines... my $format = qq{%*s | %s\n}; # Track previous severity and avoid repeating the same level... state $prev_severity = q{}; if ($severity !~ /\S/) { # Do nothing } elsif ($severity eq 'info' && $prev_severity eq 'info' ) { $severity = q{}; } else { $prev_severity = $severity; } # Display first line with severity indicator (unless same as previous)... printf {*Regexp::Grammars::LOGFILE} $format, $MIN_CONTEXT_WIDTH, $severity, shift @lines; # Display first line without severity indicator for my $next_line (@lines) { printf {*Regexp::Grammars::LOGFILE} $format, $MIN_CONTEXT_WIDTH, q{}, $next_line; } } # Handle user interactions during runtime debugging... sub _debug_interact { my ($stack_height, $leader, $curr_frame_ref, $min_debug_level) = @_; our $DEBUG; # ...stores current debug level within regex # Only interact with terminals, and if debug level is appropriate... if (-t *Regexp::Grammars::LOGFILE && defined $DEBUG && ($DEBUG_LEVEL{$DEBUG}//0) >= $DEBUG_LEVEL{$min_debug_level} ) { local $/ = "\n"; # ...in case some caller is being clever INPUT: while (1) { my $cmd = readline // q{}; chomp $cmd; # Input of 'd' means 'display current result frame'... if ($cmd eq 'd') { print {*Regexp::Grammars::LOGFILE} join "\n", map { $leader . ($stack_height?'| ':q{}) . ' : ' . $_ } split "\n", q{ }x8 . substr(Dumper($curr_frame_ref),8); print "\t"; } # Any other (valid) input changes debugging level and continues... else { if (defined $DEBUG_LEVEL{$cmd}) { $DEBUG = $cmd; } last INPUT; } } } # When interaction not indicated, just complete the debugging line... else { print {*Regexp::Grammars::LOGFILE} "\n"; } } # Handle reporting of unsuccessful match attempts... sub _debug_handle_failures { my ($stack_height, $subrule, $in_match) = @_; our @try_stack; # Unsuccessful match attempts leave "leftovers" on the attempt stack... CLEANUP: while (@try_stack && $try_stack[-1]{height} >= $stack_height) { # Grab record of (potentially) unsuccessful attempt... my $error_ref = pop @try_stack; # If attempt was the one whose match is being reported, go and report... last CLEANUP if $in_match && $error_ref->{height} == $stack_height && $error_ref->{subrule} eq $subrule; # Otherwise, report the match failure... say {*Regexp::Grammars::LOGFILE} _debug_context(q{ }) . $error_ref->{errmsg}; } } # Handle attempts to call non-existent subrules... sub _debug_fatal { my ($naughty_construct) = @_; print {*Regexp::Grammars::LOGFILE} "_________________________________________________________________\n", "Fatal error: Entire parse terminated prematurely while attempting\n", " to call non-existent rule: $naughty_construct\n", "_________________________________________________________________\n"; $@ = "Entire parse terminated prematurely while attempting to call non-existent rule: $naughty_construct"; } # Handle objrules that don't return hashes... sub _debug_non_hash { my ($obj, $name) = @_; # If the object is okay, no further action required... return q{} if reftype($obj) eq 'HASH'; # Generate error messages... print {*Regexp::Grammars::LOGFILE} "_________________________________________________________________\n", "Fatal error: returned a non-hash-based object\n", "_________________________________________________________________\n"; $@ = " returned a non-hash-based object"; return '(*COMMIT)(*FAIL)'; } # Print a message in context... sub _debug_logmsg { my ($stack_height, @msg) = @_; # Determine indent for messages... my $leader = _debug_context() . q{| } x ($stack_height-1) . '|'; # Report the attempt... print {*Regexp::Grammars::LOGFILE} map { "$leader$_\n" } @msg; } # Print a message indicating a (sub)match attempt... sub _debug_trying { my ($stack_height, $curr_frame_ref, $subrule) = @_; # Clean up after any preceding unsuccessful attempts... _debug_handle_failures($stack_height, $subrule); # Determine indent for messages... my $leader = _debug_context() . q{| } x ($stack_height-2); # Detect and report any backtracking prior to this attempt... our $last_try_pos //= 0; #...Stores the pos() of the most recent match attempt? my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-2) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . q{ and trying new match} ; } # Report the attempt... print {*Regexp::Grammars::LOGFILE} $leader, "|...Trying $subrule\t"; # Handle user interactions during debugging... _debug_interact($stack_height, $leader, $curr_frame_ref, 'step'); # Record the attempt, for later error handling in _debug_matched()... if ($subrule ne 'next alternative') { our @try_stack; push @try_stack, { height => $stack_height, subrule => $subrule, # errmsg should align under: |...Trying $subrule\t errmsg => q{| } x ($stack_height-2) . "| \\FAIL $subrule", }; } $last_try_pos = pos(); } # Print a message indicating a successful (sub)match... sub _debug_matched { my ($stack_height, $curr_frame_ref, $subrule, $matched_text) = @_; # Clean up any intervening unsuccessful attempts... _debug_handle_failures($stack_height, $subrule, 'in match'); # Build debugging message... my $debug_context = _debug_context(); my $leader = $debug_context . q{| } x ($stack_height-2); my $message = ($stack_height ? '| ' : q{}) . " \\_____$subrule matched "; my $filler = $stack_height ? '| ' . q{ } x (length($message)-4) : q{ } x length($message); our $last_try_pos //= 0; #...Stores the pos() of the most recent match attempt? # Report if match required backtracking... my $backtrack_distance = $last_try_pos - (pos()//0); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-2) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . qq{ and rematching $subrule} ; } $last_try_pos = pos(); # Format match text (splitting multi-line texts and indent them correctly)... $matched_text = defined($matched_text) ? $matched_text = q{'} . join("\n$leader$filler", split "\n", $matched_text) . q{'} : q{}; # Print match message... print {*Regexp::Grammars::LOGFILE} $leader . $message . $matched_text . qq{\t}; # Check for user interaction... _debug_interact($stack_height, $leader, $curr_frame_ref, $stack_height ? 'match' : 'run'); } # Print a message indicating a successful (sub)match... sub _debug_require { my ($stack_height, $condition, $succeeded) = @_; # Build debugging message... my $debug_context = _debug_context(); my $leader = $debug_context . q{| } x ($stack_height-1); my $message1 = ($stack_height ? '|...' : q{}) . "Testing condition: $condition" ; my $message2 = ($stack_height ? '| ' : q{}) . " \\_____" . ($succeeded ? 'Satisfied' : 'FAILED') ; # Report if match required backtracking... our $last_try_pos; my $backtrack_distance = $last_try_pos - pos(); if ($backtrack_distance > 0) { say {*Regexp::Grammars::LOGFILE} ' <' . q{~} x (length(_debug_context(q{ }))-3) . q{ } . q{| } x ($stack_height-1) . qq{|...Backtracking $backtrack_distance char} . ($backtrack_distance > 1 ? q{s} : q{}) . qq{ and rematching} ; } # Remember where the condition was tried... $last_try_pos = pos(); # Print match message... say {*Regexp::Grammars::LOGFILE} $leader . $message1; say {*Regexp::Grammars::LOGFILE} $leader . $message2; } # Print a message indicating a successful store-result-of-code-block... sub _debug_executed { my ($stack_height, $curr_frame_ref, $subrule, $value) = @_; # Build message... my $leader = _debug_context() . q{| } x ($stack_height-2); my $message = "|...Action $subrule\n"; my $message2 = "| saved value: '"; $message .= $leader . $message2; my $filler = q{ } x length($message2); # Split multiline results over multiple lines (properly indented)... $value = join "\n$leader$filler", split "\n", $value; # Report the action... print {*Regexp::Grammars::LOGFILE} $leader . $message . $value . qq{'\t}; # Check for user interaction... _debug_interact($stack_height, $leader, $curr_frame_ref, 'match'); } # Create the code to be inserted into the regex to facilitate debugging... sub _build_debugging_statements { my ($debugging_active, $subrule, $extra_pre_indent) = @_; return (q{}, q{}) if ! $debugging_active;; $extra_pre_indent //= 0; $subrule = "q{$subrule}"; return ( qq{Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], $subrule) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};}, qq{Regexp::Grammars::_debug_matched(\@Regexp::Grammars::RESULT_STACK+1, \$Regexp::Grammars::RESULT_STACK[-1], $subrule, \$^N) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};}, ); } sub _build_raw_debugging_statements { my ($debugging_active, $subpattern, $extra_pre_indent) = @_; return (q{}, q{}) if ! $debugging_active; $extra_pre_indent //= 0; if ($subpattern eq '|') { return ( q{}, qq{(?{;Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], 'next alternative') if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};})}, ); } else { return ( qq{(?{;Regexp::Grammars::_debug_trying(\@Regexp::Grammars::RESULT_STACK+$extra_pre_indent, \$Regexp::Grammars::RESULT_STACK[-2+$extra_pre_indent], q{subpattern /$subpattern/}, \$^N) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};})}, qq{(?{;Regexp::Grammars::_debug_matched(\@Regexp::Grammars::RESULT_STACK+1, \$Regexp::Grammars::RESULT_STACK[-1], q{subpattern /$subpattern/}, \$^N) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG};})}, ); } } #=====[ SUPPORT FOR AUTOMATIC TIMEOUTS ]========================= sub _test_timeout { our ($DEBUG, $TIMEOUT); return q{} if time() < $TIMEOUT->{'limit'}; my $duration = "$TIMEOUT->{duration} second" . ( $TIMEOUT->{duration} == 1 ? q{} : q{s} ); if (defined($DEBUG) && $DEBUG ne 'off') { my $leader = _debug_context(q{ }); say {*LOGFILE} $leader . '|'; say {*LOGFILE} $leader . "|...Invoking {duration}>"; say {*LOGFILE} $leader . "| \\_____No match after $duration"; say {*LOGFILE} $leader . '|'; say {*LOGFILE} $leader . " \\FAIL "; } if (! @!) { @! = "Internal error: Timed out after $duration (as requested)"; } return q{(*COMMIT)(*FAIL)}; } #=====[ SUPPORT FOR UPDATING THE RESULT STACK ]========================= # Create a clone of the current result frame with an new key/value... sub _extend_current_result_frame_with_scalar { my ($stack_ref, $key, $value) = @_; # Autovivify null stacks (only occur when grammar invokes no subrules)... if (!@{$stack_ref}) { $stack_ref = [{}]; } # Copy existing frame, appending new value so it overwrites any old value... my $cloned_result_frame = { %{$stack_ref->[-1]}, $key => $value, }; # Make the copy into an object, if the original was one... if (my $class = blessed($stack_ref->[-1])) { bless $cloned_result_frame, $class; } return $cloned_result_frame; } # Create a clone of the current result frame with an additional key/value # (As above, but preserving the "listiness" of the key being added to)... sub _extend_current_result_frame_with_list { my ($stack_ref, $key, $value) = @_; # Copy existing frame, appending new value to appropriate element's list... my $cloned_result_frame = { %{$stack_ref->[-1]}, $key => [ @{$stack_ref->[-1]{$key}//[]}, $value, ], }; # Make the copy into an object, if the original was one... if (my $class = blessed($stack_ref->[-1])) { bless $cloned_result_frame, $class; } return $cloned_result_frame; } # Pop current result frame and add it to a clone of previous result frame # (flattening it if possible, and preserving any blessing)... sub _pop_current_result_frame { my ($stack_ref, $key, $original_name, $value) = @_; # Where are we in the stack? my $curr_frame = $stack_ref->[-1]; my $caller_frame = $stack_ref->[-2]; # Track which frames are objects... my $is_blessed_curr = blessed($curr_frame); my $is_blessed_caller = blessed($caller_frame); # Remove "private" captures (i.e. those starting with _)... delete @{$curr_frame}{grep {substr($_,0,1) eq '_'} keys %{$curr_frame} }; # Remove "nocontext" marker... my $nocontext = delete $curr_frame->{'~'}; # Build a clone of the current frame... my $cloned_result_frame = exists $curr_frame->{'='} ? $curr_frame->{'='} : $is_blessed_curr || length(join(q{}, keys %{$curr_frame})) ? { q{} => $value, %{$curr_frame} } : keys %{$curr_frame} ? $curr_frame->{q{}} : $value ; # Apply any appropriate handler... if ($RULE_HANDLER) { if ($RULE_HANDLER->can($original_name) || $RULE_HANDLER->can('AUTOLOAD')) { my $replacement_result_frame = $RULE_HANDLER->$original_name($cloned_result_frame); if (defined $replacement_result_frame) { $cloned_result_frame = $replacement_result_frame; } } } # Remove capture if not requested... if ($nocontext && ref $cloned_result_frame eq 'HASH' && keys %{$cloned_result_frame} > 1) { delete $cloned_result_frame->{q{}}; } # Nest a clone of current frame inside a clone of the caller frame... my $cloned_caller_frame = { %{$caller_frame//{}}, $key => $cloned_result_frame, }; # Make the copies into objects, if the originals were... if ($is_blessed_curr && !exists $curr_frame->{'='} ) { bless $cloned_caller_frame->{$key}, $is_blessed_curr; } if ($is_blessed_caller) { bless $cloned_caller_frame, $is_blessed_caller; } return $cloned_caller_frame; } # Pop current result frame and add it to a clone of previous result frame # (flattening it if possible, and preserving any blessing) # (As above, but preserving listiness of key being added to)... sub _pop_current_result_frame_with_list { my ($stack_ref, $key, $original_name, $value) = @_; # Where are we in the stack? my $curr_frame = $stack_ref->[-1]; my $caller_frame = $stack_ref->[-2]; # Track which frames are objects... my $is_blessed_curr = blessed($curr_frame); my $is_blessed_caller = blessed($caller_frame); # Remove "private" captures (i.e. those starting with _)... delete @{$curr_frame}{grep {substr($_,0,1) eq '_'} keys %{$curr_frame} }; # Remove "nocontext" marker... my $nocontext = delete $curr_frame->{'~'}; # Clone the current frame... my $cloned_result_frame = exists $curr_frame->{'='} ? $curr_frame->{'='} : $is_blessed_curr || length(join(q{}, keys %{$curr_frame})) ? { q{} => $value, %{$curr_frame} } : keys %{$curr_frame} ? $curr_frame->{q{}} : $value ; # Apply any appropriate handler... if ($RULE_HANDLER) { if ($RULE_HANDLER->can($original_name) || $RULE_HANDLER->can('AUTOLOAD')) { my $replacement_result_frame = $RULE_HANDLER->$original_name($cloned_result_frame); if (defined $replacement_result_frame) { $cloned_result_frame = $replacement_result_frame; } } } # Remove capture if not requested... if ($nocontext && ref $cloned_result_frame eq 'HASH' && keys %{$cloned_result_frame} > 1) { delete $cloned_result_frame->{q{}}; } # Append a clone of current frame inside a clone of the caller frame... my $cloned_caller_frame = { %{$caller_frame}, $key => [ @{$caller_frame->{$key}//[]}, $cloned_result_frame, ], }; # Make the copies into objects, if the originals were... if ($is_blessed_curr && !exists $curr_frame->{'='} ) { bless $cloned_caller_frame->{$key}[-1], $is_blessed_curr; } if ($is_blessed_caller) { bless $cloned_caller_frame, $is_blessed_caller; } return $cloned_caller_frame; } #=====[ MISCELLANEOUS CONSTANTS ]========================= # Namespace in which grammar inheritance occurs... my $CACHE = 'Regexp::Grammars::_CACHE_::'; my $CACHE_LEN = length $CACHE; my %CACHE; #...for subrule tracking # This code inserted at the start of every grammar regex # (initializes the result stack cleanly and backtrackably, via local)... my $PROLOGUE = q{((?{; @! = () if !pos; local @Regexp::Grammars::RESULT_STACK = (@Regexp::Grammars::RESULT_STACK, {}); local $Regexp::Grammars::TIMEOUT = { limit => -1>>1 }; local $Regexp::Grammars::DEBUG = 'off' }) }; # This code inserted at the end of every grammar regex # (puts final result in %/. Also defines default , , etc.)... my $EPILOGUE = q{)(?{; $Regexp::Grammars::RESULT_STACK[-1]{q{}} //= $^N;; local $Regexp::Grammars::match_frame = pop @Regexp::Grammars::RESULT_STACK; delete @{$Regexp::Grammars::match_frame}{ '~', grep {substr($_,0,1) eq '_'} keys %{$Regexp::Grammars::match_frame} }; if (exists $Regexp::Grammars::match_frame->{'='}) { if (ref($Regexp::Grammars::match_frame->{'='}) eq 'HASH') { $Regexp::Grammars::match_frame = $Regexp::Grammars::match_frame->{'='}; } } if (@Regexp::Grammars::RESULT_STACK) { $Regexp::Grammars::RESULT_STACK[-1]{'(?R)'} = $Regexp::Grammars::match_frame; } Regexp::Grammars::clear_rule_handler(); */ = $Regexp::Grammars::match_frame; })|\Z(?{Regexp::Grammars::clear_rule_handler();})(?!)(?(DEFINE) (? \\s* ) (? (?{$Regexp::Grammars::RESULT_STACK[-1]{'!'}=$#{!};}) \\s* (?{;$#{!}=delete($Regexp::Grammars::RESULT_STACK[-1]{'!'})//0; delete($Regexp::Grammars::RESULT_STACK[-1]{'@'}); }) ) (? \\S+ ) (? (?{$Regexp::Grammars::RESULT_STACK[-1]{'!'}=$#{!};}) \\S+ (?{;$#{!}=delete($Regexp::Grammars::RESULT_STACK[-1]{'!'})//0; delete($Regexp::Grammars::RESULT_STACK[-1]{'@'}); }) ) (? (?{; $Regexp::Grammars::RESULT_STACK[-1]{"="} = pos; }) ) (? (?{; $Regexp::Grammars::RESULT_STACK[-1]{"="} = 1 + substr($_,0,pos) =~ tr/\n/\n/; }) ) ) }; my $EPILOGUE_NC = $EPILOGUE; $EPILOGUE_NC =~ s{ ; .* ;;}{;}xms; #=====[ MISCELLANEOUS PATTERNS THAT MATCH USEFUL THINGS ]======== # Match an identifier... my $IDENT = qr{ [^\W\d] \w*+ }xms; my $QUALIDENT = qr{ (?: $IDENT :: )*+ $IDENT }xms; # Match balanced parentheses, taking into account \-escapes and []-escapes... my $PARENS = qr{ (?&VAR_PARENS) (?(DEFINE) (? \( (?: \\. | (?&VAR_PARENS) | (?&CHARSET) | [^][()\\]++)*+ \) ) (? \[ \^?+ \]?+ (?: \[:\w+:\] | \\. | [^]])*+ \] ) ) }xms; # Match a directive within rules... my $WS_PATTERN = qr{]++ | $PARENS )*+) >}xms; #=====[ UTILITY SUBS FOR ERROR AND WARNING MESSAGES ]======== sub _uniq { my %seen; return grep { defined $_ && !$seen{$_}++ } @_; } # Default translator for error messages... my $ERRORMSG_TRANSLATOR = sub { my ($errormsg, $rulename, $context) = @_; $rulename = 'valid input' if $rulename eq q{}; $context //= ''; # Unimplemented subrule when rulename starts with '-'... if (substr($rulename,0,1) eq '-') { $rulename = substr($rulename,1); return "Can't match subrule <$rulename> (not implemented)"; } # Empty message converts to a "Expected...but found..." message... if ($errormsg eq q{}) { $rulename =~ tr/_/ /; $rulename = lc($rulename); return "Expected $rulename, but found '$context' instead"; } # "Expecting..." messages get "but found" added... if (lc(substr($errormsg,0,6)) eq 'expect') { return "$errormsg, but found '$context' instead"; } # Everything else stays "as is"... return $errormsg; }; # Allow user to set translation... sub set_error_translator { { package Regexp::Grammars::TranslatorRestorer; sub new { my ($class, $old_translator) = @_; bless \$old_translator, $class; } sub DESTROY { my ($old_translator_ref) = @_; $ERRORMSG_TRANSLATOR = ${$old_translator_ref}; } } my ($translator_ref) = @_; die "Usage: set_error_translator(\$subroutine_reference)\n" if ref($translator_ref) ne 'CODE'; my $old_translator_ref = $ERRORMSG_TRANSLATOR; $ERRORMSG_TRANSLATOR = $translator_ref; return defined wantarray ? Regexp::Grammars::TranslatorRestorer->new($old_translator_ref) : (); } # Dispatch to current translator for error messages... sub _translate_errormsg { goto &{$ERRORMSG_TRANSLATOR}; } #=====[ SUPPORT FOR TRANSLATING GRAMMAR-ENHANCED REGEX TO NATIVE REGEX ]==== # Store any specified grammars... my %user_defined_grammar; my %REPETITION_DESCRIPTION_FOR = ( '+' => 'once or more', '*' => 'any number of times', '?' => 'if possible', '+?' => 'as few times as possible', '*?' => 'as few times as possible', '??' => 'if necessary', '++' => 'as many times as possible', '*+' => 'as many times as possible', '?+' => 'if possible', ); sub _translate_raw_regex { my ($regex, $debug_build, $debug_runtime) = @_; my $is_comment = substr($regex, 0, 1) eq q{#} || substr($regex, 0, 3) eq q{(?#}; my $visible_regex = _squeeze_ws($regex); # Report how regex was interpreted, if requested to... if ($debug_build && $visible_regex ne q{} && $visible_regex ne q{ }) { _debug_notify( info => " |", " |...Treating '$visible_regex' as:", ($is_comment ? " | \\ a comment (which will be ignored)" : " | \\ normal Perl regex syntax" ), ); } return q{} if $is_comment; # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_raw_debugging_statements($debug_runtime,$visible_regex, +1); # Replace negative lookahead with one that works under R::G... $regex =~ s{\(\?!}{(?!(?!)|}gxms; # TODO: Also replace positive lookahead with one that works under R::G... # This replacement should be of the form: # $regex =~ s{\(\?!}{(?!(?!)|(?!(?!)|}gxms; # but need to find a way to insert the extra ) at the other end return $debug_runtime && $regex eq '|' ? $regex . $debug_post : $debug_runtime && $regex =~ /\S/ ? "(?#)(?:$debug_pre($regex)$debug_post(?#))" # TODO: REWORK THIS INSUFFICENT FIX FOR t/grammar_autospace.t... # : $regex !~ /\S/ ? "(?:$regex)" : $regex; } # Report and convert a debugging directive... sub _translate_debug_directive { my ($construct, $cmd, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Change run-time debugging mode to '$cmd'", ); } return qq{(?{; local \$Regexp::Grammars::DEBUG = q{$cmd}; }) }; } # Report and convert a timeout directive... sub _translate_timeout_directive { my ($construct, $timeout, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", ($timeout > 0 ? " | \\ Cause the entire parse to fail after $timeout second" . ($timeout==1 ? q{} : q{s}) : " | \\ Cause the entire parse to fail immediately" ), ); } return $timeout > 0 ? qq{(?{; local \$Regexp::Grammars::TIMEOUT = { duration => $timeout, limit => time() + $timeout }; }) } : qq{(*COMMIT)(*FAIL)}; } # Report and convert a directive... sub _translate_require_directive { my ($construct, $condition, $debug_build) = @_; $condition = substr($condition, 3, -2); # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Require that {$condition} is true", ); } my $quoted_condition = $condition; $quoted_condition =~ s{\$}{}xms; return qq{(?(?{;$condition}) (?{;Regexp::Grammars::_debug_require( scalar \@Regexp::Grammars::RESULT_STACK, q{$quoted_condition}, 1) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}}) | (?{;Regexp::Grammars::_debug_require( scalar \@Regexp::Grammars::RESULT_STACK, q{$quoted_condition}, 0) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG}})(?!)) }; } # Report and convert a directive... sub _translate_minimize_directive { my ($construct, $debug_build) = @_; # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | \\ Minimize result value if possible", ); } return q{(?{; if (1 == grep { $_ ne '!' && $_ ne '@' && $_ ne '~' } keys %MATCH) { # ...single alnum key local %Regexp::Grammars::matches = %MATCH; delete @Regexp::Grammars::matches{'!', '@', '~'}; local ($Regexp::Grammars::only_key) = keys %Regexp::Grammars::matches; local $Regexp::Grammars::array_ref = $MATCH{$Regexp::Grammars::only_key}; if (ref($Regexp::Grammars::array_ref) eq 'ARRAY' && 1 == @{$Regexp::Grammars::array_ref}) { $MATCH = $Regexp::Grammars::array_ref->[0]; } } })}; } # Report and convert a debugging directive... sub _translate_error_directive { my ($construct, $type, $msg, $debug_build, $subrule_name) = @_; $subrule_name //= 'undef'; # Determine severity... my $severity = ($type eq 'error') ? 'fail' : 'non-fail'; # Determine fatality (and build code to invoke it)... my $fatality = ($type eq 'fatal') ? '(*COMMIT)(*FAIL)' : q{}; # Unpack message... if (substr($msg,0,3) eq '(?{') { $msg = 'do'. substr($msg,2,-1); } else { $msg = quotemeta $msg; $msg = qq{qq{$msg}}; } # Report how directive was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", ( $type eq 'log' ? " | \\ Log a message to the logfile" : " | \\ Append a $severity error message to \@!" ), ); } # Generate the regex... return $type eq 'log' ? qq{(?{Regexp::Grammars::_debug_logmsg(scalar \@Regexp::Grammars::RESULT_STACK,$msg) if \$Regexp::Grammars::DEBUG_LEVEL{\$Regexp::Grammars::DEBUG} })} : qq{(?:(?{;local \$Regexp::Grammar::_memopos=pos();}) (?>\\s*+((?-s).{0,$MAX_CONTEXT_WIDTH}+)) (?{; pos() = \$Regexp::Grammar::_memopos; @! = Regexp::Grammars::_uniq( @!, Regexp::Grammars::_translate_errormsg($msg,q{$subrule_name},\$CONTEXT) ) }) (?!)|} . ($severity eq 'fail' ? q{(?!)} : $fatality) . q{)} ; } sub _translate_subpattern { my ($construct, $alias, $subpattern, $savemode, $postmodifier, $debug_build, $debug_runtime, $timeout, $backref) = @_; # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; my $is_codeblock = substr($subpattern,0,3) eq '(?{'; my $value_saved = $is_codeblock ? '$^R' : '$^N'; my $do_something_with = $is_codeblock ? 'execute the code block' : 'match the pattern'; my $result = $is_codeblock ? 'result' : 'matched substring'; my $description = $is_codeblock ? substr($subpattern,2,-1) : defined $backref ? $backref : $subpattern; my $debug_construct = $is_codeblock ? '<' . substr($alias,1,-1) . '= (?{;' . substr($subpattern,3,-2) . '})>' : $construct ; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? "each $result" : substr($postmodifier,0,1) eq '?' ? "any $result" : $postmodifier && !$is_noncapturing ? "only the final $result" : "the $result" ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | $do_something_with $description $repeatedly", ( $is_noncapturing ? " | \\ but don't save $results" : $is_listifying ? " | \\ appending $results to \@{\$MATCH{$alias}}" : " | \\ saving $results in \$MATCH{$alias}" ) ); } # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime,$debug_construct, +1); # Generate post-match result-capturing code, if match captures... my $post_action = $is_noncapturing ? q{} : qq{local \@Regexp::Grammars::RESULT_STACK = ( \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-2], Regexp::Grammars::_extend_current_result_frame_with_$savemode( \\\@Regexp::Grammars::RESULT_STACK, $alias, $value_saved ), );} ; # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Translate to standard regex code... return qq{$timeout_test(?{;local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK;$debug_pre})(?:($subpattern)(?{;$post_action$debug_post}))$postmodifier}; } sub _translate_hashmatch { my ($construct, $alias, $hashname, $keypat, $savemode, $postmodifier, $debug_build, $debug_runtime, $timeout) = @_; # Empty or missing keypattern defaults to <.hk>... if (!defined $keypat || $keypat !~ /\S/) { $keypat = '(?&hk__implicit__)' } else { $keypat = substr($keypat, 1, -1); } # Determine save behaviour... my $is_noncapturing = $savemode eq 'noncapturing'; my $is_listifying = $savemode eq 'list'; # Convert hash to hash lookup... my $hash_lookup = '$' . substr($hashname, 1). '{$^N}'; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? 'each matched key' : substr($postmodifier,0,1) eq '?' ? 'any matched key' : $postmodifier && !$is_noncapturing ? 'only the final matched key' : 'the matched key' ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | match a key from the hash $hashname $repeatedly", ( $is_noncapturing ? " | \\ but don't save $results" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ) ); } # Generate run-time debugging code (if any)... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime,$construct, +1); # Generate post-match result-capturing code, if match captures... my $post_action = $is_noncapturing ? q{} : qq{local \@Regexp::Grammars::RESULT_STACK = ( \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-2], Regexp::Grammars::_extend_current_result_frame_with_$savemode( \\\@Regexp::Grammars::RESULT_STACK, $alias, \$^N ), );} ; # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Translate to standard regex code... return qq{$timeout_test(?:(?{;local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK;$debug_pre})(?:($keypat)(??{exists $hash_lookup ? q{} : q{(?!)}})(?{;$post_action$debug_post})))$postmodifier}; } # Convert a " % " construct to pure Perl 5.10... sub _translate_separated_list { my ($term, $op, $separator, $term_trans, $sep_trans, $ws, $debug_build, $debug_runtime, $timeout) = @_; # This insertion ensures backtracking upwinds the stack correctly... state $CHECKPOINT = q{(?{;@Regexp::Grammars::RESULT_STACK = @Regexp::Grammars::RESULT_STACK;})}; # Translate meaningful whitespace... $ws = length($ws) ? q{(?&ws__implicit__)} : q{}; # Generate support for optional trailing separator... my $opt_trailing = substr($op,-2) eq '%%' ? qq{$ws$sep_trans?} : q{}; # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Report how construct was interpreted, if requested to... if ($debug_build) { _debug_notify( info => " |", " |...Treating $term $op $separator as:", " | | repeatedly match the subrule $term", " | \\ as long as the matches are separated by matches of $separator", (substr($op,-2) eq '%%' ? " | \\ and allowing an optional trailing $separator" : q{} ) ); } # One-or-more... return qq{$timeout_test(?:$ws$CHECKPOINT$sep_trans$ws$term_trans)*$+$opt_trailing} if $op =~ m{ [*][*]() | [+]([+?]?) \s* %%?+ | \{ 1, \}([+?]?) \s* %%?+ }xms; # Zero-or-more... return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans)*$+)?$+$opt_trailing} if $op =~ m{ [*]([+?]?) \s* %%? | \{ 0, \}([+?]?) \s* %%? }xms; # One-or-zero... return qq{?$+$opt_trailing} if $op =~ m{ [?]([+?]?) \s* %%? | \{ 0,1 \}([+?]?) \s* %%? }xms; # Zero exactly... return qq{{0}$ws$opt_trailing} if $op =~ m{ \{ 0 \}[+?]? \s* %%? }xms; # N exactly... if ($op =~ m{ \{ (\d+) \}([+?]?) \s* %%? }xms ) { my $min = $1-1; return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans){$min}$+$opt_trailing)} } # Zero-to-N... if ($op =~ m{ \{ 0,(\d+) \}([+?]?) \s* %%? }xms ) { my $max = $1-1; return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans){0,$max}$+)?$+$opt_trailing} } # M-to-N and M-to-whatever... if ($op =~ m{ \{ (\d+),(\d*) \} ([+?]?) \s* %%? }xms ) { my $min = $1-1; my $max = $2 ? $2-1 : q{}; return qq{{0}$timeout_test$ws(?:$term_trans(?:$ws$CHECKPOINT$sep_trans$ws$term_trans){$min,$max}$+$opt_trailing)} } # Somehow we missed a case (this should never happen)... die "Internal error: missing case in separated list handler"; } sub _translate_subrule_call { my ($source_line, $source_file, $rulename, $grammar_name, $construct, $alias, $subrule, $args, $savemode, $postmodifier, $debug_build, $debug_runtime, $timeout, $valid_subrule_names_ref) = @_; # Translate arg list, if provided... my $arg_desc; if ($args eq q{}) { $args = q{()}; } elsif (substr($args,0,3) eq '(?{') { # Turn parencode into do block... $arg_desc = substr($args,3,-2); substr($args,1,1) = 'do'; } else { # Turn abbreviated format into a key=>value list... $args =~ s{ [(,] \s* \K : (\w+) (?= \s* [,)] ) }{$1 => \$MATCH{'$1'}}gxms; $arg_desc = substr($args,1,-1); } # Transform qualified subrule names... my $simple_subrule = $subrule; my $start_grammar = (($simple_subrule =~ s{(.*)::}{}xms) ? $1 : ""); if ($start_grammar !~ /^NEXT$|::/) { $start_grammar = caller(3).'::'.$start_grammar; } my @candidates = $start_grammar eq 'NEXT' ? _ancestry_of($grammar_name) : _ancestry_of($start_grammar); # Rename fully-qualified rule call, if to ancestor grammar... RESOLVING: for my $parent_class (@candidates) { my $inherited_subrule = $parent_class.'::'.$simple_subrule; if ($CACHE{$inherited_subrule}) { $subrule = $inherited_subrule; last RESOLVING; } } # Replace package separators, which regex engine can't handle... my $internal_subrule = $subrule; $internal_subrule =~ s{::}{_88_}gxms; # Shortcircuit if unknown subrule invoked... if (!$valid_subrule_names_ref->{$subrule}) { _debug_notify( error => qq{Found call to $construct inside definition of $rulename}, qq{near $source_file line $source_line.}, qq{But no or was defined in the grammar}, qq{(Did you misspell $construct? Or forget to define the rule?)}, q{}, ); return "(?{Regexp::Grammars::_debug_fatal('$construct')})(*COMMIT)(*FAIL)"; } # Determine save behaviour... my $is_noncapturing = $savemode =~ /noncapturing|lookahead/; my $is_listifying = $savemode eq 'list'; my $save_code = $is_noncapturing? q{ @Regexp::Grammars::RESULT_STACK[0..@Regexp::Grammars::RESULT_STACK-2] } : $is_listifying? qq{ \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-3], Regexp::Grammars::_pop_current_result_frame_with_list( \\\@Regexp::Grammars::RESULT_STACK, $alias, '$simple_subrule', \$^N ), } : qq{ \@Regexp::Grammars::RESULT_STACK[0..\@Regexp::Grammars::RESULT_STACK-3], Regexp::Grammars::_pop_current_result_frame( \\\@Regexp::Grammars::RESULT_STACK, $alias, '$simple_subrule', \$^N ), } ; # Report how construct was interpreted, if requested to... my $repeatedly = $REPETITION_DESCRIPTION_FOR{$postmodifier} // q{}; my $results = $is_listifying && $postmodifier ? 'each match' : substr($postmodifier,0,1) eq '?' ? 'any match' : 'the match' ; my $do_something_with = $savemode eq 'neglookahead' ? 'lookahead for anything except' : $savemode eq 'poslookahead' ? 'lookahead for' : 'match' ; if ($debug_build) { _debug_notify( info => " |", " |...Treating $construct as:", " | | $do_something_with the subrule <$subrule> $repeatedly", (defined $arg_desc ? " | | passing the args: ($arg_desc)" : () ), ( $is_noncapturing ? " | \\ but don't save anything" : $is_listifying ? " | \\ appending $results to \$MATCH{$alias}" : " | \\ saving $results in \$MATCH{$alias}" ), ); } # Generate post-match result-capturing code, if match captures... my ($debug_pre, $debug_post) = _build_debugging_statements($debug_runtime, $construct); # Generate timeout test... my $timeout_test = $timeout ? q{(??{;Regexp::Grammars::_test_timeout()})} : q{}; # Translate to standard regex code... return qq{(?:$timeout_test(?{; local \@Regexp::Grammars::RESULT_STACK = (\@Regexp::Grammars::RESULT_STACK, {'\@'=>{$args}}); $debug_pre})((?&$internal_subrule))(?{; local \@Regexp::Grammars::RESULT_STACK = ( $save_code );$debug_post }))$postmodifier}; } sub _translate_rule_def { my ($type, $qualifier, $name, $callname, $qualname, $body, $objectify, $local_ws, $nocontext) = @_; $qualname =~ s{::}{_88_}gxms; # Return object if requested... my $objectification = $objectify ? qq{(??{; local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK; \$Regexp::Grammars::RESULT_STACK[-1] = '$qualifier$name'->can('new') ? '$qualifier$name'->new(\$Regexp::Grammars::RESULT_STACK[-1]) : bless \$Regexp::Grammars::RESULT_STACK[-1], '$qualifier$name'; Regexp::Grammars::_debug_non_hash(\$Regexp::Grammars::RESULT_STACK[-1],'$name'); })} : q{}; # Each rule or token becomes a DEFINE'd Perl 5.10 named capture... my $implicit_version = ($callname eq 'ws' || $callname eq 'hk') ? qq{(?<${callname}__implicit__> $body) } : qq{}; return qq{ (?(DEFINE) $local_ws (?<$qualname> (?<$callname> (?{\@{\$Regexp::Grammars::RESULT_STACK[-1]}{'!','~'}=(\$#{!}, $nocontext);}) (?:$body) $objectification (?{;\$#{!}=delete(\$Regexp::Grammars::RESULT_STACK[-1]{'!'})//0; delete(\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}); }) )) $implicit_version ) }; } # Locate any valid <...> sequences and replace with native regex code... sub _translate_subrule_calls { my ($source_file, $source_line, $grammar_name, $grammar_spec, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $pre_match_debug, $post_match_debug, $rule_name, $subrule_names_ref, $magic_ws, ) = @_; my $pretty_rule_name = $rule_name ? ($magic_ws ? '" : 'main regex (before first rule)'; # Remember the preceding construct, so as to implement the +% etc. operators... my $prev_construct = q{}; my $prev_translation = q{}; my $curr_line_num = 1; # Translate all other calls (MAIN GRAMMAR FOR MODULE)... $grammar_spec =~ s{ (?{ $curr_line_num = substr($_, 0, pos) =~ tr/\n//; }) (? (? \s*+) (? (?&SEPLIST_OP) ) (? \s*+) )? (? (? \. \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? (? \? | \! ) \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? \[ \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* \] ) | (? (?(?&IDENT)) \s* = \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&QUALIDENT)) \s* (?(?&ARGLIST)) \s* \] ) | (? \s* : (?(?&QUALIDENT)) \s* ) | (? (?(?&IDENT)) \s* = \s* : (?(?&QUALIDENT)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* : (?(?&QUALIDENT)) \s* \] ) | (? \. (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)|(?&LITERAL)) \s* ) | (? (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)|(?&LITERAL)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&PARENCODE)|(?&PARENS)|(?&LITERAL)) \s* \] ) | (? (?(?&HASH)) \s* (?(?&BRACES))? \s* ) | (? (?(?&IDENT)) \s* = \s* (?(?&HASH)) \s* (?(?&BRACES))? \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (?(?&HASH)) \s* (?(?&BRACES))? \s* \] ) | (? \s* (? \\ | /) (? [:] (?&QUALIDENT)) \s* | \s* (? \\_ | /) (? (?&QUALIDENT)) \s* ) | (? (?(?&IDENT)) \s* = \s* (? \\ | /) (? [:] (?&QUALIDENT)) \s* | (?(?&IDENT)) \s* = \s* (? \\_ | /) (? (?&QUALIDENT)) \s* ) | (? \[ (?(?&IDENT)) \s* = \s* (? \\ | /) (? [:] (?&QUALIDENT)) \s* \] | \[ (?(?&IDENT)) \s* = \s* (? \\_ | /) (? (?&QUALIDENT)) \s* \] ) | (? minimize \s* : \s* ) | (? require \s* : \s* (? (?&PARENCODE) ) \s* ) | (? debug \s* : \s* (? run | match | step | try | off | on) \s* ) | (? timeout \s* : \s* (? \d+) \s* ) | (? context \s* : \s* ) | (? nocontext \s* : \s* ) | (? [.][.][.] | [!][!][!] | [?][?][?] ) | (? (? error | fatal ) \s*+ : \s*+ ) | (? (? log | error | warning | fatal ) \s*+ : \s*+ (? (?&PARENCODE) | .+? ) \s*+ ) ) > (? \s* (?! (?&SEPLIST_OP) ) [?+*][?+]? | ) | (? $WS_PATTERN ) | (? (?&SEPLIST_OP) \s* (? \S* ) ) | (? \(\?\<\w+\> ) | (? < [^>\n]* [>\n] ) | (? (? (?: \\[^shv] | (?! (?&PARENCODE) ) (?&PARENS) | (?&CHARSET) | \w++ | \| ) (?&QUANTIFIER)? ) | (? \s++ | \\. (?&QUANTIFIER)? | \(\?! | \(\?\# [^)]* \) # (?# -> old style inline comment) | (?&PARENCODE) | \# [^\n]*+ | [^][\s()<>#\\]++ ) ) (?(DEFINE) (? \*\* | [*+?] [+?]?+ \s* %%?+ | \{ \d+(,\d*)? \} [+?]?+ \s* %%?+ ) (? \( (?:[?] (?: <[=!] | [:>] ))? (?: \\. | (?&PARENCODE) | (?&PARENS) | (?&CHARSET) | [^][()\\<>]++ )*+ \) ) (? \{ (?: \\. | (?&BRACES) | [^{}\\]++ )*+ \} ) (? \(\?\??\{ (?: \\. | (?&BRACES) | [^{}\\]++ )*+ \}\) ) (? \% (?&IDENT) (?: :: (?&IDENT) )* ) (? \[ \^?+ \]?+ (?: \[:\w+:\] | \\. | [^]] )*+ \] ) (? [^\W\d]\w*+ ) (? (?: [^\W\d]\w*+ :: )* [^\W\d]\w*+ ) (? (?&NUMBER) | (?&STRING) | (?&VAR) ) (? [+-]? \d++ (?:\. \d++)? (?:[eE] [+-]? \d++)? ) (? ' [^\\']++ (?: \\. [^\\']++ )* ' ) (? (?&PARENCODE) | \( \s* (?&ARGS)? \s* \) | (?# NOTHING ) ) (? (?&ARG) \s* (?: , \s* (?&ARG) \s* )* ,? ) (? (?&VAR) | (?&KEY) \s* => \s* (?&LITERAL) ) (? : (?&IDENT) ) (? (?&IDENT) | (?&LITERAL) ) (? [*+?][+?]? | \{ \d+,?\d* \} [+?]? ) ) }{ my $curr_construct = $+{construct}; my $list_marker = $+{list_marker} // q{}; my $alias = ($+{alias}//'MATCH') eq 'MATCH' ? q{'='} : qq{'$+{alias}'}; # Determine and remember the necessary translation... my $curr_translation = do{ # Translate subrule calls of the form: ... if (defined $+{alias_parens_scalar}) { my $pattern = substr($+{pattern},0,1) eq '(' ? $+{pattern} : "(?{$+{pattern}})"; _translate_subpattern( $curr_construct, $alias, $pattern, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif (defined $+{alias_parens_scalar_nocap}) { my $pattern = substr($+{pattern},0,1) eq '(' ? $+{pattern} : "(?{$+{pattern}})"; _translate_subpattern( $curr_construct, $alias, $pattern, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif (defined $+{alias_parens_list}) { my $pattern = substr($+{pattern},0,1) eq '(' ? $+{pattern} : "(?{$+{pattern}})"; _translate_subpattern( $curr_construct, $alias, $pattern, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } # Translate subrule calls of the form: ... elsif (defined $+{alias_hash_scalar}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, $+{keypat}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif (defined $+{alias_hash_scalar_nocap}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, $+{keypat}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } elsif (defined $+{alias_hash_list}) { _translate_hashmatch( $curr_construct, $alias, $+{varname}, $+{keypat}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); } # Translate subrule calls of the form: ... elsif (defined $+{alias_subrule_scalar}) { _translate_subrule_call( $source_line, $source_file, $pretty_rule_name, $grammar_name, $curr_construct, $alias, $+{subrule}, $+{args}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, ); } elsif (defined $+{alias_subrule_list}) { _translate_subrule_call( $source_line, $source_file, $pretty_rule_name, $grammar_name, $curr_construct, $alias, $+{subrule}, $+{args}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, ); } # Translate subrule calls of the form: and ... elsif (defined $+{self_subrule_lookahead}) { # Determine type of lookahead, and work around capture problem... my ($type, $pre, $post) = ( 'neglookahead', '(?!(?!)|', ')' ); if ($+{sign} eq '?') { $type = 'poslookahead'; $pre x= 2; $post x= 2; } $pre . _translate_subrule_call( $source_line, $source_file, $pretty_rule_name, $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, $type, q{}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, ) . $post; } elsif (defined $+{self_subrule_scalar_nocap}) { _translate_subrule_call( $source_line, $source_file, $pretty_rule_name, $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, ); } elsif (defined $+{self_subrule_scalar}) { _translate_subrule_call( $source_line, $source_file, $pretty_rule_name, $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, ); } elsif (defined $+{self_subrule_list}) { _translate_subrule_call( $source_line, $source_file, $pretty_rule_name, $grammar_name, $curr_construct, qq{'$+{subrule}'}, $+{subrule}, $+{args}, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $subrule_names_ref, ); } # Translate subrule calls of the form: ... elsif (defined $+{alias_argrule_scalar}) { my $pattern = qq{(??{;\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}{'$+{subrule}'} // '(?!)'})}; _translate_subpattern( $curr_construct, $alias, $pattern, 'scalar', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$ARG{'$+{subrule}'}" ); } elsif (defined $+{alias_argrule_list}) { my $pattern = qq{(??{;\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}{'$+{subrule}'} // '(?!)'})}; _translate_subpattern( $curr_construct, $alias, $pattern, 'list', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$ARG{'$+{subrule}'}" ); } # Translate subrule calls of the form: <:ARGNAME>... elsif (defined $+{self_argrule_scalar}) { my $pattern = qq{(??{;\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}{'$+{subrule}'} // '(?!)'})}; _translate_subpattern( $curr_construct, qq{'$+{subrule}'}, $pattern, 'noncapturing', $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$ARG{'$+{subrule}'}" ); } # Translate subrule calls of the form: <\IDENT> or ... elsif (defined $+{backref} || $+{alias_backref} || $+{alias_backref_list}) { # Use "%ARGS" if subrule names starts with a colon... my $subrule = $+{subrule}; if (substr($subrule,0,1) eq ':') { substr($subrule,0,1,"\@'}{'"); } my $backref = qq{\$Regexp::Grammars::RESULT_STACK[-1]{'$subrule'}}; my $quoter = $+{slash} eq '\\' || $+{slash} eq '\\_' ? "quotemeta($backref)" : "Regexp::Grammars::_invert_delim($backref)" ; my $pattern = qq{ (??{ defined $backref ? $quoter : q{(?!)}})}; my $type = $+{backref} ? 'noncapturing' : $+{alias_backref} ? 'scalar' : 'list' ; _translate_subpattern( $curr_construct, $alias, $pattern, $type, $+{modifier}, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, "in \$MATCH{'$subrule'}" ); } # Translate reportable raw regexes (add debugging support)... elsif (defined $+{reportable_raw_regex}) { _translate_raw_regex( $+{reportable_raw_regex}, $compiletime_debugging_requested, $runtime_debugging_requested ); } # Translate too-complex repetition specifications... elsif (defined $+{complex_repetition}) { my ($repetition, $separator) = @+{'complex_repetition', 'complex_separator'}; my ($metaop) = $repetition =~ m{(%%?)}; my $quotedop = quotemeta($metaop); $separator =~ s/\s+/ /g; my $problem = $separator =~ /\S/ ? ["The $separator... separator you specified after the $metaop is too complex", "(Try refactoring it into a single subrule call)", ] : ["No separator was specified after the $metaop", "(Or did you need a $quotedop instead, to match a literal '$metaop'?)", ]; _debug_notify( fatal => "Invalid separation specifier: $metaop", "at line $curr_line_num of $pretty_rule_name", @{$problem}, ); exit(1); } # Translate non-reportable raw regexes (leave as is)... elsif (defined $+{raw_regex}) { # Handle raw % and %% my $raw_regex = $+{raw_regex}; if ($raw_regex =~ / \A %%?+ /x) { _debug_notify( fatal => "Invalid separation specifier: $&", "at line $curr_line_num of $pretty_rule_name", "(Did you forget to put a repetition quantifier before the $&", " or did you need a " . quotemeta($&) . " instead, to match a literal '$&'?)", ); exit(1); } # Handle any other raw regex... _translate_raw_regex( $raw_regex, $compiletime_debugging_requested ); } # Translate directives... elsif (defined $+{require_directive}) { _translate_require_directive( $curr_construct, $+{condition}, $compiletime_debugging_requested ); } elsif (defined $+{minimize_directive}) { _translate_minimize_directive( $curr_construct, $+{condition}, $compiletime_debugging_requested ); } elsif (defined $+{debug_directive}) { _translate_debug_directive( $curr_construct, $+{cmd}, $compiletime_debugging_requested ); } elsif (defined $+{timeout_directive}) { _translate_timeout_directive( $curr_construct, $+{timeout}, $compiletime_debugging_requested ); } elsif (defined $+{error_directive}) { _translate_error_directive( $curr_construct, $+{error_type}, $+{msg}, $compiletime_debugging_requested, $rule_name ); } elsif (defined $+{autoerror_directive}) { _translate_error_directive( $curr_construct, $+{error_type}, q{}, $compiletime_debugging_requested, $rule_name ); } elsif (defined $+{yadaerror_directive}) { _translate_error_directive( $curr_construct, ($+{yadaerror_directive} eq '???' ? 'warning' : 'error'), q{}, $compiletime_debugging_requested, -$rule_name ); } elsif (defined $+{context_directive}) { if ($compiletime_debugging_requested) { _debug_notify( info => " |", " |...Treating $curr_construct as:", " | \\ Turn on context-saving for the current rule" ); } q{}; # Remove the directive } elsif (defined $+{nocontext_directive}) { if ($compiletime_debugging_requested) { _debug_notify( info => " |", " |...Treating $curr_construct as:", " | \\ Turn off context-saving for the current rule" ); } q{}; # Remove the directive } elsif (defined $+{ws_directive}) { if ($compiletime_debugging_requested) { _debug_notify( info => " |", " |...Treating $curr_construct as:", " | \\ Change whitespace matching for the current rule" ); } $curr_construct; } # Something that looks like a rule call or directive, but isn't... elsif (defined $+{incomplete_request}) { my $request = $+{incomplete_request}; $request =~ s/\n//g; if ($request =~ /\A\s*<\s*\Z/) { _debug_notify( fatal => qq{Invalid < metacharacter near line $curr_line_num of $pretty_rule_name}, qq{If you meant to match a literal '<', use: \\<}, ); } else { _debug_notify( fatal => qq{Possible failed attempt to specify}, qq{a subrule call or directive: $request}, qq{near line $curr_line_num of $pretty_rule_name}, qq{If you meant to match literally, use: \\$request}, ); } exit(1); } # A quantifier that isn't quantifying anything... elsif (defined $+{loose_quantifier}) { my $quant = $+{loose_quantifier}; $quant =~ s{^\s+}{}; my $literal = quotemeta($quant); _debug_notify( fatal => qq{Quantifier that doesn't quantify anything: $quant}, qq{at line $curr_line_num in declaration of $pretty_rule_name}, qq{(Did you mean to match literally? If so, try: $literal)}, q{}, ); exit(1); } # There shouldn't be any other possibility... else { die qq{Internal error: this shouldn't happen!\n}, qq{Near '$curr_construct' at $curr_line_num of $pretty_rule_name\n}; } }; # Handle the **/*%/*%%/+%/{n,m}%/etc operators... if ($list_marker) { my $ws = $magic_ws ? $+{ws1} . $+{ws2} : q{}; my $op = $+{op}; $curr_translation = _translate_separated_list( $prev_construct, $op, $curr_construct, $prev_translation, $curr_translation, $ws, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, ); $curr_construct = qq{$prev_construct $op $curr_construct}; } # Finally, remember this latest translation, and return it... $prev_construct = $curr_construct; $prev_translation = $curr_translation;; }exmsg; # Translate magic hash accesses... $grammar_spec =~ s{\$(?:\:\:)?MATCH (?= \s*\{) } {\$Regexp::Grammars::RESULT_STACK[-1]}xmsg; $grammar_spec =~ s{\$(?:\:\:)?ARG (?= \s*\{) } {\$Regexp::Grammars::RESULT_STACK[-1]{'\@'}}xmsg; # Translate magic scalars and hashes... state $translate_scalar = { q{%$MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{@$MATCH} => q{@{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{$MATCH} => q{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}, q{%MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]}}, q{$CAPTURE} => q{$^N}, q{$CONTEXT} => q{$^N}, q{$DEBUG} => q{$Regexp::Grammars::DEBUG}, q{$INDEX} => q{${\\pos()}}, q{%ARG} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{'@'}}}, q{%$::MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{@$::MATCH} => q{@{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}}, q{$::MATCH} => q{$Regexp::Grammars::RESULT_STACK[-1]{q{=}}}, q{%::MATCH} => q{%{$Regexp::Grammars::RESULT_STACK[-1]}}, q{$::CAPTURE} => q{$^N}, q{$::CONTEXT} => q{$^N}, q{$::DEBUG} => q{$Regexp::Grammars::DEBUG}, q{$::INDEX} => q{${\\pos()}}, q{%::ARG} => q{%{$Regexp::Grammars::RESULT_STACK[-1]{'@'}}}, }; state $translatable_scalar = join '|', map {quotemeta $_} sort {length $b <=> length $a} keys %{$translate_scalar}; $grammar_spec =~ s{ ($translatable_scalar) (?! \s* (?: \[ | \{) ) } {$translate_scalar->{$1}}oxmsg; return $grammar_spec; } # Generate a "decimal timestamp" and insert in a template... sub _timestamp { my ($template) = @_; # Generate and insert any timestamp... if ($template =~ /%t/) { my ($sec, $min, $hour, $day, $mon, $year) = localtime; $mon++; $year+=1900; my $timestamp = sprintf("%04d%02d%02d.%02d%02d%02d", $year, $mon, $day, $hour, $min, $sec); $template =~ s{%t}{$timestamp}xms;; } return $template; } # Open (or re-open) the requested log file... sub _autoflush { my ($fh) = @_; my $originally_selected = select $fh; $|=1; select $originally_selected; } sub _open_log { my ($mode, $filename, $from_where) = @_; $from_where //= q{}; # Special case: '-' --> STDERR if ($filename eq q{-}) { return *STDERR{IO}; } # Otherwise, just open the named file... elsif (open my $fh, $mode, $filename) { _autoflush($fh); return $fh; } # Otherwise, generate a warning and default to STDERR... else { local *Regexp::Grammars::LOGFILE = *STDERR{IO}; _debug_notify( warn => qq{Unable to open log file '$filename'}, ($from_where ? $from_where : ()), qq{($!)}, qq{Defaulting to STDERR instead.}, q{}, ); return *STDERR{IO}; } } sub _invert_delim { my ($delim) = @_; $delim = reverse $delim; $delim =~ tr/<>[]{}()??`'/><][}{)(??'`/; return quotemeta $delim; } # Regex to detect if other regexes contain a grammar specification... my $GRAMMAR_DIRECTIVE = qr{ < grammar: \s* (? $QUALIDENT ) \s* > }xms; # Regex to detect if other regexes contain a grammar inheritance... my $EXTENDS_DIRECTIVE = qr{ < extends: \s* (? $QUALIDENT ) \s* > }xms; # Cache of rule/token names within defined grammars... my %subrule_names_for; # Build list of ancestors for a given grammar... sub _ancestry_of { my ($grammar_name) = @_; return () if !$grammar_name; use mro; return map { substr($_, $CACHE_LEN) } @{mro::get_linear_isa($CACHE.$grammar_name, 'c3')}; } # Detect and translate any requested grammar inheritances... sub _extract_inheritances { my ($source_line, $source_file, $regex, $compiletime_debugging_requested, $derived_grammar_name) = @_; # Detect and remove inheritance requests... while ($regex =~ s{$EXTENDS_DIRECTIVE}{}xms) { # Normalize grammar name and report... my $orig_grammar_name = $+{base_grammar_name}; my $grammar_name = $orig_grammar_name; if ($grammar_name !~ /::/) { $grammar_name = caller(2).'::'.$grammar_name; } if (exists $user_defined_grammar{$grammar_name}) { if ($compiletime_debugging_requested) { _debug_notify( info => "Processing inheritance request for $grammar_name...", q{}, ); } # Specify new relationship... no strict 'refs'; push @{$CACHE.$derived_grammar_name.'::ISA'}, $CACHE.$grammar_name; } else { _debug_notify( fatal => "Inheritance from unknown grammar requested", "by directive", "in regex grammar declared at $source_file line $source_line", q{}, ); exit(1); } } # Retrieve ancestors (but not self) in C3 dispatch order... my (undef, @ancestors) = _ancestry_of($derived_grammar_name); # Extract subrule names and implementations for ancestors... my %subrule_names = map { %{$subrule_names_for{$_}} } @ancestors; $_ = -1 for values %subrule_names; my $implementation = join "\n", map { $user_defined_grammar{$_} } @ancestors; return $implementation, \%subrule_names; } # Transform grammar-augmented regex into pure Perl 5.10 regex... sub _build_grammar { my ($grammar_spec) = @_; $grammar_spec .= q{}; # Check for lack of Regexp::Grammar-y constructs and short-circuit... if ($grammar_spec !~ m{ < (?: [.?![:%\\/]? [^\W\d]\w* [^>]* | [.?!]{3} ) > }xms) { return $grammar_spec; } # Remember where we parked... my ($source_file, $source_line) = (caller 1)[1,2]; $source_line -= $grammar_spec =~ tr/\n//; # Check for dubious repeated constructs that throw away captures... my $dubious_line = $source_line; while ($grammar_spec =~ m{ (.*?) ( < (?! \[ ) # not <[SUBRULE]> ( $IDENT (?: = [^>]*)? ) # but or > \s* ( # followed by a quantifier... [+*][?+]? # either symbolic | \{\d+(?:,\d*)?\}[?+]? # or numeric ) ) }gxms) { my ($prefix, $match, $rule, $qual) = ($1, $2, $3, $4); $dubious_line += $prefix =~ tr/\n//; _debug_notify( warn => qq{Repeated subrule <$rule>$qual}, qq{at $source_file line $dubious_line}, qq{will only capture its final match}, qq{(Did you mean <[$rule]>$qual instead?)}, q{}, ); $dubious_line += $match =~ tr/\n//; } # Check for dubious non-backtracking constructs... $dubious_line = $source_line; while ( $grammar_spec =~ m{ (.*?) ( < (?! (?:obj)? (?:rule: | token ) ) ( [^>]+ ) > \s* ( [?+*][+] | \{.*\}[+] ) ) }gxms) { my ($prefix, $match, $rule, $qual) = ($1, $2, $3, $4); $dubious_line += $prefix =~ tr/\n//; my $safe_qual = substr($qual,0,-1); _debug_notify( warn => qq{Non-backtracking subrule call <$rule>$qual}, qq{at $source_file line $dubious_line}, qq{may not revert correctly during backtracking.}, qq{(If grammar does not work, try <$rule>$safe_qual instead)}, q{}, ); $dubious_line += $match =~ tr/\n//; } # Check whether a log file was specified... my $compiletime_debugging_requested; local *Regexp::Grammars::LOGFILE = *Regexp::Grammars::LOGFILE; my $logfile = q{-}; my $log_where = "for regex grammar defined at $source_file line $source_line"; $grammar_spec =~ s{ ^ [^#]* < logfile: \s* ([^>]+?) \s* > }{ $logfile = _timestamp($1); # Presence of implies compile-time logging... $compiletime_debugging_requested = 1; *Regexp::Grammars::LOGFILE = _open_log('>',$logfile, $log_where ); # Delete directive... q{}; }gexms; # Look ahead for any run-time debugging or timeout requests... my $runtime_debugging_requested = $grammar_spec =~ m{ ^ [^#]* < debug: \s* (run | match | step | try | on | same ) \s* > | \$DEBUG (?! \s* (?: \[ | \{) ) }xms; my $timeout_requested = $grammar_spec =~ m{ ^ [^#]* < timeout: \s* \d+ \s* > }xms; # Standard actions set up and clean up any regex debugging... # Before entire match, set up a stack of attempt records and report... my $pre_match_debug = $runtime_debugging_requested ? qq{(?{; *Regexp::Grammars::LOGFILE = Regexp::Grammars::_open_log('>>','$logfile', '$log_where'); Regexp::Grammars::_init_try_stack(); })} : qq{(?{; *Regexp::Grammars::LOGFILE = Regexp::Grammars::_open_log('>>','$logfile', '$log_where'); })} ; # After entire match, report whether successful or not... my $post_match_debug = $runtime_debugging_requested ? qq{(?{;Regexp::Grammars::_debug_matched(0,\\%/,'',\$^N)}) |(?>(?{;Regexp::Grammars::_debug_handle_failures(0,''); }) (?!)) } : q{} ; # Remove comment lines... $grammar_spec =~ s{^ ([^#\n]*) \s \# [^\n]* }{$1}gxms; # Subdivide into rule and token definitions, preparing to process each... # REWRITE THIS, USING (PROBABLY NEED TO REFACTOR ALL GRAMMARS TO REUSe # THESE COMPONENTS: # (? \( \s* (?&PARAMS)? \s* \) | (?# NOTHING ) ) # (? (?&PARAM) \s* (?: , \s* (?&PARAM) \s* )* ,? ) # (? (?&VAR) (?: \s* = \s* (?: (?&LITERAL) | (?&PARENCODE) ) )? ) # (? (?&NUMBER) | (?&STRING) | (?&VAR) ) # (? : (?&IDENT) ) my @defns = split m{ (< (obj|)(rule|token) \s*+ : \s*+ ((?:${IDENT}::)*+) (?: ($IDENT) \s*+ = \s*+ )?+ ($IDENT) \s* >) }xms, $grammar_spec; # Extract up list of names of defined rules/tokens... # (Name is every 6th item out of every seven, skipping the first item) my @subrule_names = @defns[ map { $_ * 7 + 6 } 0 .. ((@defns-1)/7-1) ]; my @defns_copy = @defns[1..$#defns]; my %subrule_names; # Build a look-up table of subrule names, checking for duplicates... my $defn_line = $source_line + $defns[0] =~ tr/\n//; my %first_decl_explanation; for my $subrule_name (@subrule_names) { my ($full_decl, $objectify, $type, $qualifier, $name, $callname, $body) = splice(@defns_copy, 0, 7); if (++$subrule_names{$subrule_name} > 1) { _debug_notify( warn => "Redeclaration of <$objectify$type: $subrule_name>", "at $source_file line $defn_line", "will be ignored.", @{ $first_decl_explanation{$subrule_name} }, q{}, ); } else { $first_decl_explanation{$subrule_name} = [ "(Hidden by the earlier declaration of <$objectify$type: $subrule_name>", " at $source_file line $defn_line)" ]; } $defn_line += ($full_decl.$body) =~ tr/\n//; } # Add the built-ins... @subrule_names{'ws', 'hk', 'matchpos', 'matchline'} = (1) x 4; # An empty main rule will never match anything... my $main_regex = shift @defns; if ($main_regex =~ m{\A (?: \s++ | \(\?\# [^)]* \) | \# [^\n]++ )* \z}xms) { _debug_notify( error => "No main regex specified before rule definitions", "in regex grammar declared at $source_file line $source_line", "Grammar will never match anything.", "(Or did you forget a specification?)", q{}, ); } # Compile the regex or grammar... my $regex = q{}; my $grammar_name; my $is_grammar; # Is this a grammar specification? if ($main_regex =~ $GRAMMAR_DIRECTIVE) { # Normalize grammar name and report... $grammar_name = $+{grammar_name}; if ($grammar_name !~ /::/) { $grammar_name = caller(1) . "::$grammar_name"; } $is_grammar = 1; # Add subrule definitions to namespace... for my $subrule_name (@subrule_names) { $CACHE{$grammar_name.'::'.$subrule_name} = 1; } } else { state $dummy_grammar_index = 0; $grammar_name = '______' . $dummy_grammar_index++; } # Extract any inheritance information... my ($inherited_rules, $inherited_subrule_names) = _extract_inheritances( $source_line, $source_file, $main_regex, $compiletime_debugging_requested, $grammar_name ); # Remove requests... $main_regex =~ s{ $EXTENDS_DIRECTIVE }{}gxms; # Add inherited subrule names to allowed subrule names; @subrule_names{ keys %{$inherited_subrule_names} } = values %{$inherited_subrule_names}; # Remove comments from top-level grammar... $main_regex =~ s{ \(\?\# [^)]* \) | (? }{}gxms) ? 1 : ($main_regex =~ s{ < context \s* : \s* > }{}gxms) ? 0 : 0; # If so, set up to save the grammar... if ($is_grammar) { # Normalize grammar name and report... if ($grammar_name !~ /::/) { $grammar_name = caller(1) . "::$grammar_name"; } if ($compiletime_debugging_requested) { _debug_notify( info => "Processing definition of grammar $grammar_name...", q{}, ); } # Remove the grammar directive... $main_regex =~ s{ ( $GRAMMAR_DIRECTIVE | < debug: \s* (run | match | step | try | on | off | same ) \s* > ) }{$source_line += $1 =~ tr/\n//; q{}}gexms; # Check for anything else in the main regex... if ($main_regex =~ /\A(\s*)\S/) { $source_line += $1 =~ tr/\n//; _debug_notify( warn => "Unexpected item before first subrule specification", "in definition of ", "at $source_file line $source_line:", map({ " $_"} grep /\S/, split "\n", $main_regex), "(this will be ignored when defining the grammar)", q{}, ); } # Remember set of valid subrule names... $subrule_names_for{$grammar_name} = { map({ ($_ => 1) } keys %subrule_names), map({ ($grammar_name.'::'.$_ => 1) } grep { !/::/ } keys %subrule_names), }; } else { #...not a grammar specification # Report how main regex was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( info => "Processing the main regex before any rule definitions", ); } # Any actual regex is processed first... $regex = _translate_subrule_calls( $source_file, $source_line, $grammar_name, $main_regex, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $pre_match_debug, $post_match_debug, q{}, # Expected...what? \%subrule_names, 0, # Whitespace isn't magical ); # Wrap the main regex (to ensure |'s don't segment pre and post commands)... $regex = "(?:$regex)"; # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( q{} => q{ |}, q{ \\___End of main regex}, q{}, ); } } # Update line number... $source_line += $main_regex =~ tr/\n//; # Then iterate any following rule definitions... while (@defns) { # Grab details of each rule defn (as extracted by previous split)... my ($full_decl, $objectify, $type, $qualifier, $name, $callname, $body) = splice(@defns, 0, 7); $name //= $callname; my $qualified_name = $grammar_name.'::'.$callname; # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( info => "Defining a $type: <$callname>", " |...Returns: " . ($objectify ? "an object of class '$qualifier$name'" : "a hash"), ); } my $local_nocontext = ($body =~ s{ < nocontext \s* : \s* > }{}gxms) ? 1 : ($body =~ s{ < context \s* : \s* > }{}gxms) ? 0 : $nocontext; # Translate any nested <...> constructs... my $trans_body = _translate_subrule_calls( $source_file, $source_line, $grammar_name, $body, $compiletime_debugging_requested, $runtime_debugging_requested, $timeout_requested, $pre_match_debug, $post_match_debug, $callname, # Expected...what? \%subrule_names, $type eq 'rule', # Is whitespace magical? ); # Report how construct was interpreted, if requested to... if ($compiletime_debugging_requested) { _debug_notify( q{} => q{ |}, q{ \\___End of rule definition}, q{}, ); } # Make allowance for possible local whitespace definitions... my $local_ws_defn = q{}; my $local_ws_call = q{(?&ws__implicit__)}; # Rules make non-code literal whitespace match textual whitespace... if ($type eq 'rule') { # Implement any local whitespace definition... my $first_ws = 1; WS_DIRECTIVE: while ($trans_body =~ s{$WS_PATTERN}{}oxms) { my $defn = $1; if ($defn !~ m{\S}xms) { _debug_notify( warn => qq{Ignoring useless empty directive}, qq{in definition of }, qq{near $source_file line $source_line}, qq{(Did you mean instead?)}, q{}, ); next WS_DIRECTIVE; } elsif (!$first_ws) { _debug_notify( warn => qq{Ignoring useless extra directive}, qq{in definition of }, qq{at $source_file line $source_line}, qq{(No more than one is permitted per rule!)}, q{}, ); next WS_DIRECTIVE; } else { $first_ws = 0; } state $ws_counter = 0; $ws_counter++; $local_ws_defn = qq{(?<__RG_ws_$ws_counter> $defn)}; $local_ws_call = qq{(?&__RG_ws_$ws_counter)}; } # Implement auto-whitespace... state $CODE_OR_SPACE = qr{ # TODO: REWORK THIS INSUFFICENT FIX FOR t/grammar_autospace.t... # # (? \(\?: \s++ \) ) # Explicitly walled off space is magic # | (? # These are not magic... \( \?\?? (?&BRACED) \) # Embedded code blocks | \s++ # Whitespace followed by... (?= \| # ...an OR | \(\?\#\) # ...a null comment | (?: \) \s* )? \z # ...the end of the rule | \(\(?\?\&ws\) # ...an explicit ws match | \(\?\??\{ # ...an embedded code block | \\[shv] # ...an explicit space match ) ) | (? \s++ ) # All other whitespace is magic (?(DEFINE) (? \{ (?: \\. | (?&BRACED) | [^{}] )* \} ) ) }xms; $trans_body =~ s{($CODE_OR_SPACE)}{ $+{ignorable_space} // $local_ws_call }exmsg; } else { while ($trans_body =~ s{$WS_PATTERN}{}oxms) { _debug_notify( warn => qq{Ignoring useless directive}, qq{in definition of }, qq{at $source_file line $source_line}, qq{(Did you need to define instead of ?)}, q{}, ); } } $regex .= "\n###############[ $source_file line $source_line ]###############\n" . _translate_rule_def( $type, $qualifier, $name, $callname, $qualified_name, $trans_body, $objectify, $local_ws_defn, $local_nocontext, ); # Update line number... $source_line += ($full_decl.$body) =~ tr/\n//; } # Insert checkpoints into any user-defined code block... $regex =~ s{ \( \?\?? \{ \K (?!;) }{ local \@Regexp::Grammars::RESULT_STACK = \@Regexp::Grammars::RESULT_STACK; }xmsg; # Check for any suspicious left-overs from the start of the regex... pos $regex = 0; # If a grammar definition, save grammar and return a placeholder... if ($is_grammar) { $user_defined_grammar{$grammar_name} = $regex; return qq{(?{ warn "Can't match directly against a pure grammar: \n"; })(*COMMIT)(?!)}; } # Otherwise, aggregrate the final grammar... else { return _complete_regex($regex.$inherited_rules, $pre_match_debug, $post_match_debug, $nocontext); } } sub _complete_regex { my ($regex, $pre_match_debug, $post_match_debug, $nocontext) = @_; return $nocontext ? qq{(?x)$pre_match_debug$PROLOGUE$regex$EPILOGUE_NC$post_match_debug} : qq{(?x)$pre_match_debug$PROLOGUE$regex$EPILOGUE$post_match_debug}; } 1; # Magic true value required at end of module __END__ =head1 NAME Regexp::Grammars - Add grammatical parsing features to Perl 5.10 regexes =head1 VERSION This document describes Regexp::Grammars version 1.058 =head1 SYNOPSIS use Regexp::Grammars; my $parser = qr{ (?: # Parse and save a Verb in a scalar <.ws> # Parse but don't save whitespace # Parse and save a Noun in a scalar 0.5 ? 'VN' : 'VerbNoun' })> # Save result of expression in a scalar | (?: <[Noun]> # Parse a Noun and save result in a list (saved under the key 'Noun') <[PostNoun=ws]> # Parse whitespace, save it in a list # (saved under the key 'PostNoun') )+ # Parse a Verb and save result in a scalar (saved under the key 'Verb') # Save a literal in a scalar | # Turn on the integrated debugger here <.Cmd= (?: mv? )> # Parse but don't capture a subpattern (name it 'Cmd' for debugging purposes) <[File]>+ # Parse 1+ Files and save them in a list (saved under the key 'File') # Turn off the integrated debugger here # Parse a File and save it in a scalar (saved under the key 'Dest') ) ################################################################ # Define a subrule named File <.ws> # - Parse but don't capture whitespace # - Parse the subpattern and capture # matched text as the result of the # subrule # Define a subrule named Noun cat | dog | fish # - Match an alternative (as usual) # Define a whitespace-sensitive subrule eats # - Match a literal (after any space) ? # - Parse optional subrule Noun and # save result under the key 'Object' | # Or else... # - Parse subrule AUX and save result # - Match a literal, save under 'part' # Define a whitespace-insensitive subrule (has | is) # - Match an alternative and capture (?{ $MATCH = uc $^N }) # - Use captured text as subrule result }x; # Match the grammar against some text... if ($text =~ $parser) { # If successful, the hash %/ will have the hierarchy of results... process_data_in( %/ ); } =head1 QUICKSTART CHEATSHEET =head2 In your program... use Regexp::Grammars; Allow enhanced regexes in lexical scope %/ Result-hash for successful grammar match =head2 Defining and using named grammars... Define a named grammar that can be inherited Current grammar inherits named grammar's rules =head2 Defining rules in your grammar... Define rule with magic whitespace Define rule without magic whitespace Define rule that blesses return-hash into class Define token that blesses return-hash into class Shortcut for above (rule name derived from class) Shortcut for above (token name derived from class) =head2 Matching rules in your grammar... Call named subrule (may be fully qualified) save result to $MATCH{RULENAME} Call named subrule, passing args to it Call subrule and fail if it matches (shorthand for (?!<.RULENAME>) ) <:IDENT> Match contents of $ARG{IDENT} as a pattern <\:IDENT> Match contents of $ARG{IDENT} as a literal Match closing delimiter for $ARG{IDENT} <%HASH> Match longest possible key of hash <%HASH {PAT}> Match any key of hash that also matches PAT Match closing delimiter for $MATCH{IDENT} <\_IDENT> Match the literal contents of $MATCH{IDENT} Call subrule, save result in $MATCH{ALIAS} Match a hash key, save key in $MATCH{ALIAS} Match pattern, save match in $MATCH{ALIAS} Execute code, save value in $MATCH{ALIAS} Save specified string in $MATCH{ALIAS} Save specified number in $MATCH{ALIAS} Match closing delim, save as $MATCH{ALIAS} Match '$MATCH{IDENT}', save as $MATCH{ALIAS} <.SUBRULE> Call subrule (one of the above forms), but don't save the result in %MATCH <[SUBRULE]> Call subrule (one of the above forms), but append result instead of overwriting it + % Match one or more repetitions of SUBRULE1 as long as they're separated by SUBRULE2 ** Same (only for backwards compatibility) * % Match zero or more repetitions of SUBRULE1 as long as they're separated by SUBRULE2 * %% Match zero or more repetitions of SUBRULE1 as long as they're separated by SUBRULE2 and allow an optional trailing SUBRULE2 =head2 In your grammar's code blocks... $CAPTURE Alias for $^N (the most recent paren capture) $CONTEXT Another alias for $^N $INDEX Current index of next matching position in string %MATCH Current rule's result-hash $MATCH Magic override value (returned instead of result-hash) %ARG Current rule's argument hash $DEBUG Current match-time debugging mode =head2 Directives... Fail if code evaluates false Fail after specified number of seconds Change match-time debugging mode Change debugging log file (default: STDERR) Queue error message and fail parse Queue error message and backtrack Queue warning message and continue Explicitly add a message to debugging log Override automatic whitespace matching Simplify the result of a subrule match Switch on context substring retention Switch off context substring retention =head1 DESCRIPTION This module adds a small number of new regex constructs that can be used within Perl 5.10 patterns to implement complete recursive-descent parsing. Perl 5.10 already supports recursive-descent I, via the new C<< (?...) >> and C<< (?&name) >> constructs. For example, here is a simple matcher for a subset of the LaTeX markup language: $matcher = qr{ (?&File) (?(DEFINE) (? (?&Element)* ) (? \s* (?&Command) | \s* (?&Literal) ) (? \\ \s* (?&Literal) \s* (?&Options)? \s* (?&Args)? ) (? \[ \s* (?:(?&Option) (?:\s*,\s* (?&Option) )*)? \s* \]) (? \{ \s* (?&Element)* \s* \} ) (?