pos(); $close_delimiter_match = null; while($stack) { $next = $this->get_next($patterns); if ($next[0] === -1) { $this->terminate(); $finish = $this->pos(); break; } elseif($balanced && $next[1][2] === $delimiter) { $stack++; $finish = $next[0] + strlen($next[1][0]); } elseif($next[1][2] === $close) { $stack--; if (!$stack) $close_delimiter_match = $next[1][2]; $finish = $next[0] + strlen($next[1][1]); } else assert(0); $this->pos($next[0] + strlen($next[1][0])); } $substr = substr($this->string(), $start, $finish-$start); // special case for qw, the string is not a 'STRING', it is actually // a whitespace separated list of strings. So we need to split it and // record them separately if ($type === 'SPLIT_STRING') { foreach(preg_split('/(\s+)/', $substr, -1, PREG_SPLIT_DELIM_CAPTURE) as $token) { if (preg_match('/^\s/', $token)) { $this->record($token, null); } else { $this->record($token, 'STRING'); } } } else { $this->record($substr, $type); } if ($close_delimiter_match !== null) { $this->record($close_delimiter_match, 'DELIMITER'); } } // Helper function: guesses whether or not a slash is a regex delimiter // by looking behind in the token stream. function is_delimiter() { for($i = count($this->tokens) - 1; $i >= 0; $i--) { $t = $this->tokens[$i]; if ($t[0] === null || $t[0] === 'COMMENT') continue; elseif ($t[0] === 'OPENER' || $t[0] === 'OPERATOR') return true; elseif ($t[0] === 'IDENT') { switch($t[1]) { // named operators case 'lt': case 'gt': case 'le': case 'ge': case 'eq': case 'ne': case 'cmp': case 'and': case 'or': case 'xor': // other keywords/functions case 'if': case 'elsif': case 'while': case 'unless': case 'split': case 'print': return true; } } return false; } return true; } // override function for slashes, to disambiguate regexen from division // operators. function slash_override($matches) { $this->pos( $this->pos() + strlen($matches[0]) ); // this can catch '//', which I THINK is an operator but I could be wrong. if (strlen($matches[0]) === 2 || !$this->is_delimiter()) { $this->record($matches[0], 'OPERATOR'); } else { $this->record($matches[0], 'DELIMITER'); $this->consume_string($matches[0], 'REGEX'); if ($this->scan('/[cgimosx]+/')) { $this->record($this->match(), 'KEYWORD'); } } } // override function for 'quote-like operators' // e.g. m"hello", m'hello', m/hello/, m(hello), m(he()l()o()) function str_override($matches) { $this->pos( $this->pos() + strlen($matches[0]) ); $this->record($matches[0], 'DELIMITER'); $f = $matches[1]; $type = 'STRING'; if ($f === 'm' || $f === 'qr' || $f === 's' || $f === 'tr' || $f === 'y') $type = 'REGEX'; elseif($f === 'qw') $type = 'SPLIT_STRING'; $this->consume_string($matches[3], $type); if ($f === 's' || $f === 'tr' || $f === 'y') { // s/tr/y take two strings, e.g. s/something/somethingelse/, so we // have to consume the next delimiter (if it exists) and consume the // string, again. // if delims were balanced, there's a new delimiter right here, e.g. // s[something][somethingelse] $this->skip_whitespace(); $balanced = LuminousUtils::balance_delimiter($matches[3]) !== $matches[3]; if ($balanced) { $delim2 = $this->scan('/[^a-zA-Z0-9]/'); if ($delim2 !== null) { $this->record($delim2, 'DELIMITER'); $this->consume_string($delim2, 'STRING'); } } // if they weren't balanced then the delimiter is the same, and has // already been consumed as the end-delim to the first pattern else { $this->consume_string($matches[3], 'STRING'); } } if ($type === 'REGEX' && $this->scan('/[cgimosxpe]+/')) { $this->record($this->match(), 'KEYWORD'); } } // this override handles the heredoc declaration, and makes a note of it // it adds a new token (a newline) which is overridden to invoke the real // heredoc handling. This is because in Perl, heredocs declarations need not // be the end of the line so we can't necessarily start heredocing straight // away. function heredoc_override($matches) { list($group, $op, $quote1, $delim, $quote2) = $matches; $this->record($op, 'OPERATOR'); // Now, if $quote1 is '\', then $quote2 is empty. If quote2 is empty // but quote1 is not '\', this is not a heredoc. if ($quote1 === '\\' && $quote2 === '') { $this->record($quote1 . $delim, 'DELIMITER'); } elseif($quote2 === '' && $quote1 !== '') { // this is the error case // shift to the end of the op and break $this->pos_shift(strlen($op)); return; } else { $this->record($quote1 . $delim . $quote2, 'DELIMITER'); } $this->pos_shift(strlen($group)); // TODO. the quotes (matches[2] and matches[4]) are ignored for now, but // they mean something w.r.t interpolation. $this->heredoc = $delim; $this->add_pattern('HEREDOC_NL', "/\n/"); $this->overrides['HEREDOC_NL'] = array($this, 'heredoc_real_override'); } // this override handles the actual heredoc text function heredoc_real_override($matches) { $this->record($matches[0], null); $this->pos_shift(strlen($matches[0])); // don't need this anymore $this->remove_pattern('HEREDOC_NL'); assert($this->heredoc !== null); $delim = preg_quote($this->heredoc); $substr = $this->scan_until('/^' . $delim . '\\b/m'); if ($substr !== null) { $this->record($substr, 'HEREDOC'); $delim_ = $this->scan('/' . $delim . '/'); assert($delim !== null); $this->record($delim_, 'DELIMITER'); } else { $this->record($this->rest(), 'HEREDOC'); $this->terminate(); } } // halts highlighting on __DATA__ and __END__ function term_override($matches) { $this->record($matches[0], 'DELIMITER'); $this->pos( $this->pos() + strlen($matches[0]) ); $this->record($this->rest(), null); $this->terminate(); } // pod cuts might be very long and trigger the backtrack limit, so // we do it the old fashioned way function pod_cut_override($matches) { $line = $this->scan('/^=.*/m'); assert($line !== null); $term = '/^=cut$|\\z/m'; $substr = $this->scan_until($term); assert($substr !== null); $end = $this->scan($term); assert($end !== null); $this->record($line . $substr . $end, 'DOCCOMMENT'); } function init() { $this->add_pattern('COMMENT', '/#.*/'); // pod/cut documentation $this->add_pattern('podcut', '/^=[a-zA-Z_]/m'); $this->overrides['podcut'] = array($this, 'pod_cut_override'); // variables $this->add_pattern('VARIABLE', '/[\\$%@][a-z_]\w*/i'); // special variables http://www.kichwa.com/quik_ref/spec_variables.html $this->add_pattern('VARIABLE', '/\\$[\|%=\-~^\d&`\'+_\.\/\\\\,"#\\$\\?\\*O\\[\\];!@]/'); // `backticks` (shell cmd) $this->add_pattern('CMD', '/`(?: [^`\\\\]++ | \\\\ . )*+ (?:`|$)/x'); // straight strings $this->add_pattern('STRING', LuminousTokenPresets::$DOUBLE_STR); $this->add_pattern('STRING', LuminousTokenPresets::$SINGLE_STR); // terminators $this->add_pattern('TERM', '/__(?:DATA|END)__/'); // heredoc (overriden) $this->add_pattern('HEREDOC', '/(<<)([\'"`\\\\]?)([a-zA-Z_]\w*)(\\2?)/'); // operators, slash is a special case and is overridden $this->add_pattern('OPERATOR', '/[!%^&*\-=+;:|,\\.?<>~\\\\]+/'); $this->add_pattern('SLASH', '%//?%'); // we care about 'openers' for regex-vs-division disambiguatation $this->add_pattern('OPENER', '%[\[\{\(]+%x'); $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_HEX); $this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_REAL); // quote-like operators. we override these. // I got these out of the old luminous tree, I don't know how accurate // or complete they are. // According to psh, delimiters can be escaped? $this->add_pattern('DELIMETERS', '/(q[rqxw]?|m|s|tr|y)([\s]*)(\\\\?[^a-zA-Z0-9\s])/'); $this->add_pattern('IDENT', '/[a-zA-Z_]\w*/'); $this->overrides['DELIMETERS'] = array($this, 'str_override'); $this->overrides['SLASH'] = array($this, 'slash_override'); $this->overrides['HEREDOC'] = array($this, 'heredoc_override'); $this->overrides['TERM'] = array($this, 'term_override'); // map cmd to a 'function' and get rid of openers $this->rule_tag_map = array( 'CMD' => 'FUNCTION', 'OPENER' => null, ); // this sort of borks with the strange regex delimiters $this->remove_filter('pcre'); /************************************************************************/ // data definition follows. // https://www.physiol.ox.ac.uk/Computing/Online_Documentation/Perl-5.8.6/index-functions-by-cat.html $this->add_identifier_mapping('KEYWORD', array( 'bless', 'caller', 'continue', 'dbmclose', 'dbmopen', 'defined', 'delete', 'die', 'do', 'dump', 'else', 'elsif', 'eval', 'exit', 'for', 'foreach', 'goto', 'import', 'if', 'last', 'local', 'my', 'next', 'no', 'our', 'package', 'prototype', 'redo', 'ref', 'reset', 'return', 'require', 'scalar', 'sub', 'tie', 'tied', 'undef', 'utie', 'unless', 'use', 'wantarray', 'while')); $this->add_identifier_mapping('OPERATOR', array('lt', 'gt', 'le', 'ge', 'eq', 'ne', 'cmp', 'and', 'or', 'xor')); $this->add_identifier_mapping('FUNCTION', array( 'chomp', 'chop', 'chr', 'crypt', 'hex', 'index', 'lc', 'lcfirst', 'length', 'oct', 'ord', 'pack', 'reverse', 'rindex', 'sprintf', 'substr', 'uc', 'ucfirst', 'pos', 'quotemeta', 'split', 'study', 'abs', 'atan2', 'cos', 'exp', 'hex', 'int', 'log', 'oct', 'rand', 'sin', 'sqrt', 'srand', 'pop', 'push', 'shift', 'splice', 'unshift', 'grep', 'join', 'map', 'reverse', 'sort', 'unpack', 'delete', 'each', 'exists', 'keys', 'values', 'binmode', 'close', 'closedir', 'dbmclose', 'dbmopen', 'die', 'eof', 'fileno', 'flock', 'format', 'getc', 'print', 'printf', 'read', 'readdir', 'readline', 'rewinddir', 'seek', 'seekdir', 'select', 'syscall', 'sysread', 'sysseek', 'syswrite', 'tell', 'telldir', 'truncate', 'warn', 'write', 'pack', 'read', 'syscall', 'sysread', 'sysseek', 'syswrite', 'unpack', 'vec', 'chdir', 'chmod', 'chown', 'chroot', 'fcntl', 'glob', 'ioctl', 'link', 'lstat', 'mkdir', 'open', 'opendir', 'readlink', 'rename', 'rmdir', 'stat', 'symlink', 'sysopen', 'umask', 'unlink', 'utime', 'alarm', 'exec', 'fork', 'getpgrp', 'getppid', 'getpriority', 'kill', 'pipe', 'qx/STRING/', 'readpipe', 'setpgrp', 'setpriority', 'sleep', 'system', 'times', 'wait', 'waitpid', 'accept', 'bind', 'connect', 'getpeername', 'getsockname', 'getsockopt', 'listen', 'recv', 'send', 'setsockopt', 'shutdown', 'socket', 'socketpair', 'msgctl', 'msgget', 'msgrcv', 'msgsnd', 'semctl', 'semget', 'semop', 'shmctl', 'shmget', 'shmread', 'shmwrite', 'endgrent', 'endhostent', 'endnetent', 'endpwent', 'getgrent', 'getgrgid', 'getgrnam', 'getlogin', 'getpwent', 'getpwnam', 'getpwuid', 'setgrent', 'setpwent', 'endprotoent', 'endservent', 'gethostbyaddr', 'gethostbyname', 'gethostent', 'getnetbyaddr', 'getnetbyname', 'getnetent', 'getprotobyname', 'getprotobynumber', 'getprotoent', 'getservbyname', 'getservbyport', 'getservent', 'sethostent', 'setnetent', 'setprotoent', 'setservent', 'gmtime', 'localtime', 'time', 'times')); } public static function guess_language($src, $info) { // check the shebang if (preg_match('/^#!.*\\bperl\\b/', $src)) return 1.0; $p = 0; if (preg_match('/\\$[a-zA-Z_]+/', $src)) $p += 0.02; if (preg_match('/@[a-zA-Z_]+/', $src)) $p += 0.02; if (preg_match('/%[a-zA-Z_]+/', $src)) $p += 0.02; if (preg_match('/\\bsub\s+\w+\s*\\{/', $src)) $p += 0.1; if (preg_match('/\\bmy\s+[$@%]/', $src)) $p += 0.05; // $x =~ s/ if (preg_match('/\\$[a-zA-Z_]\w*\s+=~\s+s\W/', $src)) $p += 0.15; return $p; } }