mirror of
https://github.com/pierre42100/comunic
synced 2025-07-11 12:32:56 +00:00
First commit
This commit is contained in:
525
3rdparty/luminous/languages/perl.php
vendored
Executable file
525
3rdparty/luminous/languages/perl.php
vendored
Executable file
@ -0,0 +1,525 @@
|
||||
<?php
|
||||
|
||||
/*
|
||||
* Like ruby, I think it's impossible to fully tokenize Perl without
|
||||
* executing some of the code to disambiguate some symbols. As such, we're
|
||||
* going to settle for 'probably right' rather than 'definitely right'.
|
||||
*
|
||||
* TODO: I think this is mostly complete but it needs interpolation
|
||||
* highlighting in strings and heredoc, and a regex highlighting filter,
|
||||
* probably a stream filter
|
||||
*/
|
||||
|
||||
class LuminousPerlScanner extends LuminousSimpleScanner {
|
||||
|
||||
// keeps track of heredocs we need to handle
|
||||
private $heredoc = null;
|
||||
|
||||
// helper function:
|
||||
// consumes a string until the given delimiter (which may be balanced).
|
||||
// will handle nested balanced delimiters.
|
||||
// this is used as the general case for perl quote-operators like:
|
||||
// q/somestring/ q"somestring", q@somestring@, q[some[]string]
|
||||
// it can be called twice for s/someregex/somereplacement/
|
||||
// expects the initial opening delim to already have been consumed
|
||||
function consume_string($delimiter, $type) {
|
||||
$close = LuminousUtils::balance_delimiter($delimiter);
|
||||
|
||||
$balanced = $close !== $delimiter;
|
||||
$patterns = array( '/(?<!\\\\)((?:\\\\\\\\)*)('
|
||||
. preg_quote($close, '/') . ')/');
|
||||
|
||||
if ($balanced) {
|
||||
$patterns[] = '/(?<!\\\\)((?:\\\\\\\\)*)('
|
||||
. preg_quote($delimiter, '/') . ')/';
|
||||
}
|
||||
|
||||
$stack = 1; // we're already inside the string
|
||||
$start = $this->pos();
|
||||
$close_delimiter_match = null;
|
||||
while($stack) {
|
||||
$next = $this->get_next($patterns);
|
||||
if ($next[0] === -1) {
|
||||
$this->terminate();
|
||||
$finish = $this->pos();
|
||||
break;
|
||||
}
|
||||
elseif($balanced && $next[1][2] === $delimiter) {
|
||||
$stack++;
|
||||
$finish = $next[0] + strlen($next[1][0]);
|
||||
}
|
||||
elseif($next[1][2] === $close) {
|
||||
$stack--;
|
||||
if (!$stack)
|
||||
$close_delimiter_match = $next[1][2];
|
||||
$finish = $next[0] + strlen($next[1][1]);
|
||||
}
|
||||
else assert(0);
|
||||
$this->pos($next[0] + strlen($next[1][0]));
|
||||
}
|
||||
$substr = substr($this->string(), $start, $finish-$start);
|
||||
// special case for qw, the string is not a 'STRING', it is actually
|
||||
// a whitespace separated list of strings. So we need to split it and
|
||||
// record them separately
|
||||
if ($type === 'SPLIT_STRING') {
|
||||
foreach(preg_split('/(\s+)/',
|
||||
$substr, -1, PREG_SPLIT_DELIM_CAPTURE) as $token) {
|
||||
if (preg_match('/^\s/', $token)) {
|
||||
$this->record($token, null);
|
||||
} else {
|
||||
$this->record($token, 'STRING');
|
||||
}
|
||||
}
|
||||
} else {
|
||||
$this->record($substr, $type);
|
||||
}
|
||||
if ($close_delimiter_match !== null) {
|
||||
$this->record($close_delimiter_match, 'DELIMITER');
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Helper function: guesses whether or not a slash is a regex delimiter
|
||||
// by looking behind in the token stream.
|
||||
function is_delimiter() {
|
||||
for($i = count($this->tokens) - 1; $i >= 0; $i--) {
|
||||
$t = $this->tokens[$i];
|
||||
if ($t[0] === null || $t[0] === 'COMMENT') continue;
|
||||
elseif ($t[0] === 'OPENER' || $t[0] === 'OPERATOR') return true;
|
||||
elseif ($t[0] === 'IDENT') {
|
||||
switch($t[1]) {
|
||||
// named operators
|
||||
case 'lt':
|
||||
case 'gt':
|
||||
case 'le':
|
||||
case 'ge':
|
||||
case 'eq':
|
||||
case 'ne':
|
||||
case 'cmp':
|
||||
case 'and':
|
||||
case 'or':
|
||||
case 'xor':
|
||||
// other keywords/functions
|
||||
case 'if':
|
||||
case 'elsif':
|
||||
case 'while':
|
||||
case 'unless':
|
||||
case 'split':
|
||||
case 'print':
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// override function for slashes, to disambiguate regexen from division
|
||||
// operators.
|
||||
function slash_override($matches) {
|
||||
$this->pos( $this->pos() + strlen($matches[0]) );
|
||||
// this can catch '//', which I THINK is an operator but I could be wrong.
|
||||
if (strlen($matches[0]) === 2 || !$this->is_delimiter()) {
|
||||
$this->record($matches[0], 'OPERATOR');
|
||||
} else {
|
||||
$this->record($matches[0], 'DELIMITER');
|
||||
$this->consume_string($matches[0], 'REGEX');
|
||||
if ($this->scan('/[cgimosx]+/')) {
|
||||
$this->record($this->match(), 'KEYWORD');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// override function for 'quote-like operators'
|
||||
// e.g. m"hello", m'hello', m/hello/, m(hello), m(he()l()o())
|
||||
function str_override($matches) {
|
||||
|
||||
$this->pos( $this->pos() + strlen($matches[0]) );
|
||||
|
||||
$this->record($matches[0], 'DELIMITER');
|
||||
|
||||
$f = $matches[1];
|
||||
|
||||
$type = 'STRING';
|
||||
if ($f === 'm' || $f === 'qr' || $f === 's' || $f === 'tr'
|
||||
|| $f === 'y') $type = 'REGEX';
|
||||
elseif($f === 'qw') $type = 'SPLIT_STRING';
|
||||
|
||||
$this->consume_string($matches[3], $type);
|
||||
if ($f === 's' || $f === 'tr' || $f === 'y') {
|
||||
// s/tr/y take two strings, e.g. s/something/somethingelse/, so we
|
||||
// have to consume the next delimiter (if it exists) and consume the
|
||||
// string, again.
|
||||
|
||||
// if delims were balanced, there's a new delimiter right here, e.g.
|
||||
// s[something][somethingelse]
|
||||
$this->skip_whitespace();
|
||||
$balanced = LuminousUtils::balance_delimiter($matches[3]) !== $matches[3];
|
||||
if ($balanced) {
|
||||
$delim2 = $this->scan('/[^a-zA-Z0-9]/');
|
||||
if ($delim2 !== null) {
|
||||
$this->record($delim2, 'DELIMITER');
|
||||
$this->consume_string($delim2, 'STRING');
|
||||
}
|
||||
}
|
||||
// if they weren't balanced then the delimiter is the same, and has
|
||||
// already been consumed as the end-delim to the first pattern
|
||||
else {
|
||||
$this->consume_string($matches[3], 'STRING');
|
||||
}
|
||||
}
|
||||
if ($type === 'REGEX' && $this->scan('/[cgimosxpe]+/')) {
|
||||
$this->record($this->match(), 'KEYWORD');
|
||||
}
|
||||
}
|
||||
|
||||
// this override handles the heredoc declaration, and makes a note of it
|
||||
// it adds a new token (a newline) which is overridden to invoke the real
|
||||
// heredoc handling. This is because in Perl, heredocs declarations need not
|
||||
// be the end of the line so we can't necessarily start heredocing straight
|
||||
// away.
|
||||
function heredoc_override($matches) {
|
||||
list($group, $op, $quote1, $delim, $quote2) = $matches;
|
||||
$this->record($op, 'OPERATOR');
|
||||
// Now, if $quote1 is '\', then $quote2 is empty. If quote2 is empty
|
||||
// but quote1 is not '\', this is not a heredoc.
|
||||
if ($quote1 === '\\' && $quote2 === '') {
|
||||
$this->record($quote1 . $delim, 'DELIMITER');
|
||||
} elseif($quote2 === '' && $quote1 !== '') {
|
||||
// this is the error case
|
||||
// shift to the end of the op and break
|
||||
$this->pos_shift(strlen($op));
|
||||
return;
|
||||
} else {
|
||||
$this->record($quote1 . $delim . $quote2, 'DELIMITER');
|
||||
}
|
||||
$this->pos_shift(strlen($group));
|
||||
// TODO. the quotes (matches[2] and matches[4]) are ignored for now, but
|
||||
// they mean something w.r.t interpolation.
|
||||
|
||||
$this->heredoc = $delim;
|
||||
$this->add_pattern('HEREDOC_NL', "/\n/");
|
||||
$this->overrides['HEREDOC_NL'] = array($this, 'heredoc_real_override');
|
||||
}
|
||||
// this override handles the actual heredoc text
|
||||
function heredoc_real_override($matches) {
|
||||
$this->record($matches[0], null);
|
||||
$this->pos_shift(strlen($matches[0]));
|
||||
// don't need this anymore
|
||||
$this->remove_pattern('HEREDOC_NL');
|
||||
assert($this->heredoc !== null);
|
||||
$delim = preg_quote($this->heredoc);
|
||||
$substr = $this->scan_until('/^' . $delim . '\\b/m');
|
||||
if ($substr !== null) {
|
||||
$this->record($substr, 'HEREDOC');
|
||||
$delim_ = $this->scan('/' . $delim . '/');
|
||||
assert($delim !== null);
|
||||
$this->record($delim_, 'DELIMITER');
|
||||
} else {
|
||||
$this->record($this->rest(), 'HEREDOC');
|
||||
$this->terminate();
|
||||
}
|
||||
}
|
||||
|
||||
// halts highlighting on __DATA__ and __END__
|
||||
function term_override($matches) {
|
||||
$this->record($matches[0], 'DELIMITER');
|
||||
$this->pos( $this->pos() + strlen($matches[0]) );
|
||||
$this->record($this->rest(), null);
|
||||
$this->terminate();
|
||||
}
|
||||
// pod cuts might be very long and trigger the backtrack limit, so
|
||||
// we do it the old fashioned way
|
||||
function pod_cut_override($matches) {
|
||||
$line = $this->scan('/^=.*/m');
|
||||
assert($line !== null);
|
||||
$term = '/^=cut$|\\z/m';
|
||||
$substr = $this->scan_until($term);
|
||||
assert($substr !== null);
|
||||
$end = $this->scan($term);
|
||||
assert($end !== null);
|
||||
$this->record($line . $substr . $end, 'DOCCOMMENT');
|
||||
}
|
||||
|
||||
|
||||
function init() {
|
||||
|
||||
$this->add_pattern('COMMENT', '/#.*/');
|
||||
|
||||
// pod/cut documentation
|
||||
$this->add_pattern('podcut', '/^=[a-zA-Z_]/m');
|
||||
$this->overrides['podcut'] = array($this, 'pod_cut_override');
|
||||
|
||||
// variables
|
||||
$this->add_pattern('VARIABLE', '/[\\$%@][a-z_]\w*/i');
|
||||
// special variables http://www.kichwa.com/quik_ref/spec_variables.html
|
||||
$this->add_pattern('VARIABLE', '/\\$[\|%=\-~^\d&`\'+_\.\/\\\\,"#\\$\\?\\*O\\[\\];!@]/');
|
||||
|
||||
// `backticks` (shell cmd)
|
||||
$this->add_pattern('CMD', '/`(?: [^`\\\\]++ | \\\\ . )*+ (?:`|$)/x');
|
||||
// straight strings
|
||||
$this->add_pattern('STRING', LuminousTokenPresets::$DOUBLE_STR);
|
||||
$this->add_pattern('STRING', LuminousTokenPresets::$SINGLE_STR);
|
||||
// terminators
|
||||
$this->add_pattern('TERM', '/__(?:DATA|END)__/');
|
||||
// heredoc (overriden)
|
||||
$this->add_pattern('HEREDOC', '/(<<)([\'"`\\\\]?)([a-zA-Z_]\w*)(\\2?)/');
|
||||
// operators, slash is a special case and is overridden
|
||||
$this->add_pattern('OPERATOR', '/[!%^&*\-=+;:|,\\.?<>~\\\\]+/');
|
||||
$this->add_pattern('SLASH', '%//?%');
|
||||
// we care about 'openers' for regex-vs-division disambiguatation
|
||||
$this->add_pattern('OPENER', '%[\[\{\(]+%x');
|
||||
|
||||
$this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_HEX);
|
||||
$this->add_pattern('NUMERIC', LuminousTokenPresets::$NUM_REAL);
|
||||
|
||||
// quote-like operators. we override these.
|
||||
// I got these out of the old luminous tree, I don't know how accurate
|
||||
// or complete they are.
|
||||
// According to psh, delimiters can be escaped?
|
||||
$this->add_pattern('DELIMETERS',
|
||||
'/(q[rqxw]?|m|s|tr|y)([\s]*)(\\\\?[^a-zA-Z0-9\s])/');
|
||||
$this->add_pattern('IDENT', '/[a-zA-Z_]\w*/');
|
||||
|
||||
|
||||
$this->overrides['DELIMETERS'] = array($this, 'str_override');
|
||||
$this->overrides['SLASH'] = array($this, 'slash_override');
|
||||
$this->overrides['HEREDOC'] = array($this, 'heredoc_override');
|
||||
$this->overrides['TERM'] = array($this, 'term_override');
|
||||
|
||||
// map cmd to a 'function' and get rid of openers
|
||||
$this->rule_tag_map = array(
|
||||
'CMD' => 'FUNCTION',
|
||||
'OPENER' => null,
|
||||
);
|
||||
|
||||
// this sort of borks with the strange regex delimiters
|
||||
$this->remove_filter('pcre');
|
||||
|
||||
|
||||
/************************************************************************/
|
||||
// data definition follows.
|
||||
|
||||
// https://www.physiol.ox.ac.uk/Computing/Online_Documentation/Perl-5.8.6/index-functions-by-cat.html
|
||||
$this->add_identifier_mapping('KEYWORD', array( 'bless',
|
||||
'caller', 'continue', 'dbmclose', 'dbmopen',
|
||||
'defined',
|
||||
'delete', 'die', 'do', 'dump', 'else', 'elsif',
|
||||
'eval', 'exit', 'for', 'foreach', 'goto', 'import', 'if', 'last', 'local',
|
||||
'my',
|
||||
'next', 'no',
|
||||
'our', 'package', 'prototype', 'redo', 'ref', 'reset',
|
||||
'return', 'require', 'scalar', 'sub', 'tie', 'tied',
|
||||
'undef',
|
||||
'utie',
|
||||
'unless', 'use', 'wantarray', 'while'));
|
||||
$this->add_identifier_mapping('OPERATOR', array('lt', 'gt', 'le',
|
||||
'ge', 'eq', 'ne', 'cmp', 'and', 'or', 'xor'));
|
||||
|
||||
$this->add_identifier_mapping('FUNCTION', array(
|
||||
'chomp',
|
||||
'chop',
|
||||
'chr',
|
||||
'crypt',
|
||||
'hex',
|
||||
'index',
|
||||
'lc',
|
||||
'lcfirst',
|
||||
'length',
|
||||
'oct',
|
||||
'ord',
|
||||
'pack',
|
||||
'reverse',
|
||||
'rindex',
|
||||
'sprintf',
|
||||
'substr',
|
||||
'uc',
|
||||
'ucfirst',
|
||||
'pos',
|
||||
'quotemeta',
|
||||
'split',
|
||||
'study',
|
||||
'abs',
|
||||
'atan2',
|
||||
'cos',
|
||||
'exp',
|
||||
'hex',
|
||||
'int',
|
||||
'log',
|
||||
'oct',
|
||||
'rand',
|
||||
'sin',
|
||||
'sqrt',
|
||||
'srand',
|
||||
'pop',
|
||||
'push',
|
||||
'shift',
|
||||
'splice',
|
||||
'unshift',
|
||||
'grep',
|
||||
'join',
|
||||
'map',
|
||||
'reverse',
|
||||
'sort',
|
||||
'unpack',
|
||||
'delete',
|
||||
'each',
|
||||
'exists',
|
||||
'keys',
|
||||
'values',
|
||||
'binmode',
|
||||
'close',
|
||||
'closedir',
|
||||
'dbmclose',
|
||||
'dbmopen',
|
||||
'die',
|
||||
'eof',
|
||||
'fileno',
|
||||
'flock',
|
||||
'format',
|
||||
'getc',
|
||||
'print',
|
||||
'printf',
|
||||
'read',
|
||||
'readdir',
|
||||
'readline',
|
||||
'rewinddir',
|
||||
'seek',
|
||||
'seekdir',
|
||||
'select',
|
||||
'syscall',
|
||||
'sysread',
|
||||
'sysseek',
|
||||
'syswrite',
|
||||
'tell',
|
||||
'telldir',
|
||||
'truncate',
|
||||
'warn',
|
||||
'write',
|
||||
'pack',
|
||||
'read',
|
||||
'syscall',
|
||||
'sysread',
|
||||
'sysseek',
|
||||
'syswrite',
|
||||
'unpack',
|
||||
'vec',
|
||||
'chdir',
|
||||
'chmod',
|
||||
'chown',
|
||||
'chroot',
|
||||
'fcntl',
|
||||
'glob',
|
||||
'ioctl',
|
||||
'link',
|
||||
'lstat',
|
||||
'mkdir',
|
||||
'open',
|
||||
'opendir',
|
||||
'readlink',
|
||||
'rename',
|
||||
'rmdir',
|
||||
'stat',
|
||||
'symlink',
|
||||
'sysopen',
|
||||
'umask',
|
||||
'unlink',
|
||||
'utime',
|
||||
'alarm',
|
||||
'exec',
|
||||
'fork',
|
||||
'getpgrp',
|
||||
'getppid',
|
||||
'getpriority',
|
||||
'kill',
|
||||
'pipe',
|
||||
'qx/STRING/',
|
||||
'readpipe',
|
||||
'setpgrp',
|
||||
'setpriority',
|
||||
'sleep',
|
||||
'system',
|
||||
'times',
|
||||
'wait',
|
||||
'waitpid',
|
||||
'accept',
|
||||
'bind',
|
||||
'connect',
|
||||
'getpeername',
|
||||
'getsockname',
|
||||
'getsockopt',
|
||||
'listen',
|
||||
'recv',
|
||||
'send',
|
||||
'setsockopt',
|
||||
'shutdown',
|
||||
'socket',
|
||||
'socketpair',
|
||||
'msgctl',
|
||||
'msgget',
|
||||
'msgrcv',
|
||||
'msgsnd',
|
||||
'semctl',
|
||||
'semget',
|
||||
'semop',
|
||||
'shmctl',
|
||||
'shmget',
|
||||
'shmread',
|
||||
'shmwrite',
|
||||
'endgrent',
|
||||
'endhostent',
|
||||
'endnetent',
|
||||
'endpwent',
|
||||
'getgrent',
|
||||
'getgrgid',
|
||||
'getgrnam',
|
||||
'getlogin',
|
||||
'getpwent',
|
||||
'getpwnam',
|
||||
'getpwuid',
|
||||
'setgrent',
|
||||
'setpwent',
|
||||
'endprotoent',
|
||||
'endservent',
|
||||
'gethostbyaddr',
|
||||
'gethostbyname',
|
||||
'gethostent',
|
||||
'getnetbyaddr',
|
||||
'getnetbyname',
|
||||
'getnetent',
|
||||
'getprotobyname',
|
||||
'getprotobynumber',
|
||||
'getprotoent',
|
||||
'getservbyname',
|
||||
'getservbyport',
|
||||
'getservent',
|
||||
'sethostent',
|
||||
'setnetent',
|
||||
'setprotoent',
|
||||
'setservent',
|
||||
'gmtime',
|
||||
'localtime',
|
||||
'time',
|
||||
'times'));
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
||||
public static function guess_language($src, $info) {
|
||||
// check the shebang
|
||||
if (preg_match('/^#!.*\\bperl\\b/', $src)) return 1.0;
|
||||
$p = 0;
|
||||
if (preg_match('/\\$[a-zA-Z_]+/', $src)) $p += 0.02;
|
||||
if (preg_match('/@[a-zA-Z_]+/', $src)) $p += 0.02;
|
||||
if (preg_match('/%[a-zA-Z_]+/', $src)) $p += 0.02;
|
||||
if (preg_match('/\\bsub\s+\w+\s*\\{/', $src)) $p += 0.1;
|
||||
if (preg_match('/\\bmy\s+[$@%]/', $src)) $p += 0.05;
|
||||
// $x =~ s/
|
||||
if (preg_match('/\\$[a-zA-Z_]\w*\s+=~\s+s\W/', $src)) $p += 0.15;
|
||||
return $p;
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user