comunic/3rdparty/luminous/languages/ruby.php
2016-11-19 12:08:12 +01:00

523 lines
17 KiB
PHP
Executable File

<?php
/*
* Ruby's grammar is basically insane. We're not going to aim to correctly
* highlight all legal Ruby code because we'll be here all year and we'll still
* get it wrong, but we're going to have a go at getting the standard stuff
* right as well as:
* heredocs
* balanced AND NESTED string/regex delimiters
* interpolation
*
* disclaimer: I don't actually know Ruby.
*
* The problem is that Ruby *appears* to have to disambiguate loads of stuff at
* runtime, which is frankly a little optimistic for a syntax highlighter.
* Ruby allows you to omit the parentheses of a call, so it's not practical
* (and it is impossible if the code snippet is incomplete) to figure out
* operator/operand position. e.g.
* x = y %r/z/x
* is x = y mod r div z div x, unless y is a function, in which case it's:
* x = y( /z/x ) where /z/x is a regex
*/
class LuminousRubyScanner extends LuminousScanner {
// set to true if this is a nested scanner which needs to exit if it
// encounters a } while nothing else is on the stack, i.e. it is being
// used to process an interpolated block
public $interpolation = false;
protected $curley_braces = 0; // poor man's curly brace stack.
public $rails = false;
// operators depend somewhat on whether or not rails is active, else we
// don't want to consume a '%' if it comes right before a '>', we want
// to leave that for the rails close-tag detection
private $operator_regex = null;
private $string_regex = null;
private $comment_regex = null;
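// Numeric literals: ?x character/control codes, hex, binary, octal, decimal
// integers and floats, each optionally followed by _digit groups.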
private $numeric = '/
(?:
#control codes
(?:\?(?:\\\[[:alpha:]]-)*[[:alpha:]])
|
#hex
(?:0[xX](?>[0-9A-Fa-f]+)[lL]*)
|
# binary
(?:0[bB][0-1]+)
|
#octal
(?:0[oO0][0-7]+)
|
# regular number
(?:
(?>[0-9]+)
(?:
# fraction
(?:
(?:\.?(?>[0-9]+)?
(?:(?:[eE][\+\-]?)?(?>[0-9]+))?
)
)
)?
)
|
(
# or only after the point, float x = .1;
\.(?>[0-9]+)(?:(?:[eE][\+\-]?)?(?>[0-9]+))?
)
)
(?:_+\d+)*
/x';
/// queue of heredoc declarations which will need to be handled as soon as EOL is reached
/// each element is a tuple: (delimiter (string), indentable? (the terminator
/// may be preceded by spaces/tabs), interpolatable?)
private $heredocs = array();
/**
 * Prepares the scanner: builds the comment and operator patterns, registers
 * the keyword and builtin-function identifier mappings and adjusts the
 * token filters.
 *
 * Reads $this->rails, so that flag must be set before init() is called.
 */
public function init() {
    // In rails mode a comment must stop in front of a '%>' so the close tag
    // can still be detected; otherwise a comment runs to the end of the line.
    $this->comment_regex =
        $this->rails? "/ \# (?: [^\n%]*+ | %(?!>))* /x"
        : '/#.*/';
    // http://www.zenspider.com/Languages/Ruby/QuickRef.html#23
    // Note: the caret has to be escaped. An unescaped '^=?' alternative is a
    // start-of-subject anchor which can match the empty string (or a lone
    // '=') at offset 0 and so hides the alternatives listed after it.
    // main() stops on a rails '%>' before operators are scanned, so '%' does
    // not need special treatment here.
    $this->operator_regex = '/
\? | ;
| ::? | \*[=\*]? | \/=? | -=? | %=? | &&? | \|\|? | \.{2,3}
| \^=?
| < (?:=>|<|=)? | >=?
| =[>~] | ={1,3}
| \+=? | ![=~]?
/x';
    $this->add_identifier_mapping('KEYWORD', array('BEGIN', 'END', 'alias',
        'begin', 'break', 'case', 'class', 'def', 'defined?', 'do',
        'else', 'elsif', 'end', 'ensure', 'for', 'if', 'module', 'next',
        'redo', 'rescue', 'retry', 'return', 'self', 'super', 'then',
        'undef', 'unless', 'until', 'when', 'while', 'yield',
        'false', 'nil', 'true', '__FILE__', '__LINE__', 'TRUE', 'FALSE',
        'NIL', 'STDIN', 'STDOUT', 'STDERR', 'ENV', 'ARGF', 'ARGV', 'DATA',
        'RUBY_VERSION', 'RUBY_RELEASE_DATE', 'RUBY_PLATFORM',
        'and', 'in', 'not', 'or',
        'public', 'private', 'protected'
    ));
    // http://www.tutorialspoint.com/ruby/ruby_builtin_functions.htm
    // don't know how reliable that is... doesn't look incredibly inspiring
    $this->add_identifier_mapping('FUNCTION', array('abort', 'Array',
        'at_exit', 'autoload', 'binding', 'block_given?', 'callcc', 'caller',
        'catch', 'chomp', 'chomp!', 'chop', 'chop!', 'eval', 'exec', 'exit',
        'exit!', 'fail', 'Float', 'fork', 'format', 'gets', 'global_variables',
        'gsub', 'gsub!', 'Integer', 'lambda', 'proc', 'load', 'local_variables',
        'loop', 'open', 'p', 'print', 'printf', 'puts', 'raise',
        'rand', 'readlines', 'require', 'scan', 'select', 'set_trace_func',
        'sleep', 'split', 'sprintf', 'srand', 'String', 'syscall', 'system',
        'sub', 'sub!', 'test', 'throw', 'trace_var', 'trap', 'untrace_var',
        'abs', 'ceil', 'coerce', 'divmod', 'floor', 'integer?', 'modulo',
        'nonzero?', 'remainder', 'round', 'truncate', 'zero?', 'chr', 'size',
        'step', 'times', 'to_f', 'to_int', 'to_i', 'finite?', 'infinite?',
        'nan?', 'atan2', 'cos', 'exp', 'frexp', 'ldexp', 'log', 'log10', 'sin',
        'sqrt', 'tan'));
    // this can break a bit with Ruby's whacky syntax
    $this->remove_filter('pcre');
    // don't want this.
    $this->remove_filter('comment-to-doc');
    // A closure is used instead of create_function(), which was deprecated in
    // PHP 7.2 and removed in PHP 8.0. The second argument tells the filter
    // whether the regex token starts with a '/' delimiter.
    $this->add_filter('REGEX', function ($tok) {
        return LuminousFilters::pcre($tok, (isset($tok[1][0]) && $tok[1][0] === "/"));
    });
}
/**
 * Decides whether the '/' at the current position opens a regex literal
 * (true) or is a division operator (false).
 *
 * The decision is a heuristic: the already recorded tokens are walked
 * backwards until one of them settles the question. The exact rules Ruby
 * itself uses are not known here, so some assumptions may be wrong.
 *
 * @return bool true if the slash should be treated as a regex delimiter
 */
protected function is_regex() {
    // "/=" followed by whitespace is the divide-assign operator
    if ($this->check('%/=\s%')) {
        return false;
    }
    $space_after = (bool)$this->check("%/[ \t]%");
    $space_before = false;
    $operand_openers = array('(', ',', '{', '[');

    $idx = count($this->tokens);
    while (--$idx >= 0) {
        $token = $this->tokens[$idx];
        $kind = $token[0];

        if ($kind === 'COMMENT') {
            // comments tell us nothing, look further back
            continue;
        }
        if ($kind === 'OPERATOR' || $kind === 'STRING') {
            return true;
        }
        if (in_array($token[1], $operand_openers, true)) {
            // after an opening bracket or a comma an operand must follow
            return true;
        }
        if ($kind === null) {
            // untyped token: remember that something separated us from
            // whatever came before and keep looking
            $space_before = true;
            continue;
        }
        if ($kind === 'NUMERIC') {
            // a slash after a number is a division
            return false;
        }
        if ($kind === 'IDENT' || $kind === 'CONSTANT' || $kind === 'VALUE') {
            // Ambiguous: could be an operator or an operand. Following
            // Kate's syntax engine, treat it as a regex only when there is
            // a gap before the slash but no space/tab right after it.
            return $space_before && !$space_after;
        }
        return false;
    }
    // no preceding tokens, presumably a code fragment.
    return true;
}
/**
 * Scans an interpolated block with a nested scanner.
 *
 * The nested scanner works on the same source string, starts at the current
 * position and runs in interpolation mode. Its tagged output is recorded as
 * a single INTERPOLATION token and this scanner then resumes from wherever
 * the nested scanner stopped.
 *
 * NOTE(review): the nested scanner is created with default settings, so
 * $rails is not passed on to it - confirm whether that is intended.
 */
protected function interpolate() {
    $nested = new LuminousRubyScanner();
    $nested->interpolation = true;
    // the source is set before the position
    $nested->string($this->string());
    $nested->pos($this->pos());
    $nested->init();
    $nested->main();

    $this->record($nested->tagged(), 'INTERPOLATION', true);
    $this->pos($nested->pos());
}
// handles the heredoc array. Call at eol/bol when the heredoc queue is
// not empty
/**
 * Consumes the bodies of all queued heredocs, in declaration order.
 *
 * For each queued heredoc the text up to its terminator is recorded as
 * HEREDOC and the terminator as DELIMITER. In interpolatable heredocs each
 * '#{' interrupts the body so the interpolated block can be scanned
 * separately. If a terminator is never found, the rest of the input is
 * recorded as HEREDOC and scanning is terminated. The queue is always
 * emptied before returning.
 */
protected function do_heredoc() {
    assert (!empty($this->heredocs));
    $segment_start = $this->pos();
    $current = 0;
    while ($current < count($this->heredocs)) {
        list($ident, $identable, $interpolatable) = $this->heredocs[$current];

        // the terminator sits at the start of a line, optionally indented
        $indent_pattern = $identable ? "[ \t]*" : '';
        $searches = array(
            sprintf('/^%s%s\\b/m', $indent_pattern, preg_quote($ident, '/'))
        );
        if ($interpolatable) {
            $searches[] = '/\#\{/';
        }

        list($next, $matches) = $this->get_next($searches);
        if ($next === -1) {
            // no match for end delim, run to EOS
            $this->record(substr($this->string(), $segment_start), 'HEREDOC');
            $this->terminate();
            break;
        }
        assert($matches !== null);
        $found = $matches[0];

        // everything up to the match is heredoc body, the match itself is a
        // delimiter (either the terminator or the interpolation opener)
        $this->pos($next);
        $this->record(
            substr($this->string(), $segment_start, $this->pos() - $segment_start),
            'HEREDOC'
        );
        $this->record($found, 'DELIMITER');
        $this->pos($next + strlen($found));

        if ($found === '#{') {
            // interpolation: scan it, then carry on with the same heredoc
            $this->interpolate();
            if ($this->peek() === '}') {
                $this->record($this->get(), 'DELIMITER');
            }
        } else {
            // terminator reached, move on to the next queued heredoc
            $current++;
        }
        $segment_start = $this->pos();

        // subscanner might have consumed all the string, in which case
        // there's no point continuing
        if ($this->eos()) {
            break;
        }
    }
    // we may or may not have technically addressed all the heredocs in the
    // queue, but we do want to clear them out now
    $this->heredocs = array();
}
/**
 * Records the source text in the byte range [$from, $to) as $type.
 *
 * Nothing is recorded when the range is empty. When $split is true the text
 * is broken up on whitespace runs: the whitespace runs are recorded with a
 * null type and the pieces between them with $type.
 */
private function record_string_range($from, $to, $type, $split) {
    if ($from === $to) {
        return;
    }
    $text = substr($this->string(), $from, $to - $from);
    if (!$split) {
        $this->record($text, $type);
        return;
    }
    $pieces = preg_split('/(\s+)/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
    foreach ($pieces as $piece) {
        if (preg_match('/^\s+$/', $piece)) {
            $this->record($piece, null);
        } else {
            $this->record($piece, $type);
        }
    }
}
// handles string types (inc regexes), which may have nestable delimiters or
// interpolation.
// strdata is defined in the big ugly block in main()
// TODO: proper docs
/**
 * Scans the body of a string-like literal (string, regex, command, word
 * list) whose opening delimiter has already been consumed.
 *
 * $str_data is the tuple built in main():
 *   (token type, opening delimiter, closing delimiter, start offset of the
 *    text still to be recorded, interpolation allowed?, record the closing
 *    delimiter as a separate DELIMITER token?, split on whitespace?)
 *
 * When the opening and closing delimiters differ they may nest. Delimiters
 * and '#{' only count when they are not escaped by a backslash. If no
 * terminator is found the rest of the input is recorded and scanning is
 * terminated. A REGEX may be followed by flags, recorded as KEYWORD.
 */
protected function do_string($str_data) {
    list($type, $open_delimiter, $close_delimiter, $pos, $interpolation,
        $fancy_delim, $split) = $str_data;

    // group 1: a run of escaped backslashes, group 2: the marker itself
    $template = '/(?<!\\\\)((?:\\\\\\\\)*)(%s)/';
    $patterns = array(
        'term' => sprintf($template, preg_quote($close_delimiter, '/')),
    );
    if ($open_delimiter !== $close_delimiter) {
        // balanced delimiters can nest
        $patterns['nest'] = sprintf($template, preg_quote($open_delimiter, '/'));
    }
    if ($interpolation) {
        $patterns['interp'] = sprintf($template, preg_quote('#{', '/'));
    }

    $depth = 0;
    while (true) {
        list($name, $index, $matches) = $this->get_next_named($patterns);
        if ($name === null) {
            // nothing left to find: the literal runs to the end of input
            $this->record_string_range($pos, strlen($this->string()), $type, $split);
            $this->terminate();
            break;
        }
        $match_end = $index + strlen($matches[0]);

        if ($name === 'nest') {
            // another opener, go one level deeper
            $depth++;
            $this->pos($match_end);
            continue;
        }
        if ($name === 'term') {
            if ($depth !== 0) {
                // closes a nested pair only
                $depth--;
                $this->pos($match_end);
                continue;
            }
            // the real terminator
            if ($fancy_delim) {
                // matches[1] is either empty or a sequence of backslashes
                $this->record_string_range($pos, $index + strlen($matches[1]), $type, $split);
                $this->record($matches[2], 'DELIMITER');
            } else {
                $this->record_string_range($pos, $match_end, $type, $split);
            }
            $this->pos($match_end);
            break;
        }
        if ($name === 'interp') {
            // pause the literal, scan the interpolated code, then resume
            $this->record_string_range($pos, $index + strlen($matches[1]), $type, $split);
            $this->record($matches[2], 'DELIMITER');
            $this->pos($match_end);
            $this->interpolate();
            if ($this->peek() === '}') {
                $this->record($this->get(), 'DELIMITER');
            }
            $pos = $this->pos();
            continue;
        }
        assert(0);
    }

    if ($type === 'REGEX' && $this->scan('/[iomx]+/')) {
        $this->record($this->match(), 'KEYWORD');
    }
}
/**
 * Main scanning loop: tokenises the source from the current position.
 *
 * The loop ends at the end of input, and additionally
 *  - in interpolation mode, at the '}' which closes the interpolated block,
 *  - in rails mode, in front of a '%>' / '-%>' close tag,
 *  - after an __END__ identifier.
 */
public function main() {
    while (!$this->eos()) {
        if ($this->bol() && !empty($this->heredocs)) {
            $this->do_heredoc();
            // the heredoc body may have run to the end of the input
            if ($this->eos()) {
                break;
            }
        }
        if ($this->interpolation) {
            $c = $this->peek();
            if ($c === '{') {
                $this->curley_braces++;
            } elseif ($c === '}') {
                $this->curley_braces--;
                // The '{' of the enclosing '#{' was consumed by the parent
                // scanner and is not counted here, so only a '}' which takes
                // the depth below zero closes the interpolation. A '}' that
                // merely returns to zero closes a brace pair opened inside
                // the interpolated code and must not end it.
                if ($this->curley_braces < 0) {
                    break;
                }
            }
        }
        if ($this->rails && $this->check('/-?%>/')) {
            break;
        }
        $c = $this->peek();
        if ($c === '=' && $this->scan('/^=begin .*? (^=end|\\z)/msx')) {
            $this->record($this->match(), 'DOCCOMMENT');
        }
        elseif ($c === '#' && $this->scan($this->comment_regex)) {
            $this->record($this->match(), 'COMMENT');
        }
        elseif ($this->scan($this->numeric) !== null) {
            $this->record($this->match(), 'NUMERIC');
        }
        // special $-globals first, then ordinary $global / @ivar / @@cvar
        elseif ( $c === '$' && $this->scan('/\\$
(?:
(?:[!@`\'\+1~=\/\\\,;\._0\*\$\?:"&<>])
|
(?: -[0adFiIlpvw])
|
(?:DEBUG|FILENAME|LOAD_PATH|stderr|stdin|stdout|VERBOSE)
)/x') || $this->scan('/(\\$|@@?)\w+/')) {
            $this->record($this->match(), 'VARIABLE');
        }
        elseif ($this->scan('/:\w+/')) {
            // :symbol
            $this->record($this->match(), 'VALUE');
        }
        elseif ( $c === '<' && $this->scan('/(<<(-?))([\'"`]?)([A-Z_]\w*)(\\3)/i')) {
            // heredoc declaration; its body starts on the next line, so it
            // is only queued here: (delimiter, indentable?, interpolatable?)
            $m = $this->match_groups();
            $this->record($m[0], 'DELIMITER');
            $hdoc = array($m[4], $m[2] === '-', $m[3] !== "'");
            $this->heredocs[] = $hdoc;
        }
        // TODO: "% hello " is I think a valid string, using whitespace as
        // delimiters. We're going to disallow this for now because
        // we're not disambiguating between that and modulus
        elseif (($c === '"' || $c === "'" || $c === '`' || $c === '%') &&
            $this->scan('/[\'"`]|%( [qQrswWx](?![[:alnum:]]|$) | (?![[:alnum:]\s]|$))/xm')
            || ($c === '/' && $this->is_regex())
        ) {
            $interpolation = false;
            $type = 'STRING';
            $fancy_delim = false;
            $split = false;
            if ($c === '/') {
                // the opening slash stays part of the REGEX token
                $interpolation = true;
                $type = 'REGEX';
                $delimiter = $c;
                $pos = $this->pos();
                $this->get();
            } else {
                $pos = $this->match_pos();
                $delimiter = $this->match();
                if ($delimiter === '"') {
                    $interpolation = true;
                } elseif ($delimiter === "'") {
                    // plain string without interpolation: defaults apply
                } elseif ($delimiter === '`') {
                    $type = 'FUNCTION';
                } else {
                    // %-literal: the character after the prefix is the
                    // delimiter, group 1 holds the optional type letter
                    $delimiter = $this->get();
                    $m1 = $this->match_group(1);
                    if ($m1 === 'Q' || $m1 === 'r' || $m1 === 'W' || $m1 === 'x')
                        $interpolation = true;
                    if ($m1 === 'w' || $m1 === 'W')
                        $split = true;
                    if ($m1 === 'x') $type = 'FUNCTION';
                    elseif ($m1 === 'r') $type = 'REGEX';
                    $fancy_delim = true;
                    $this->record($this->match() . $delimiter, 'DELIMITER');
                    $pos = $this->pos();
                }
            }
            $data = array($type, $delimiter, LuminousUtils::balance_delimiter($delimiter),
                $pos, $interpolation, $fancy_delim, $split);
            $this->do_string($data);
        }
        elseif ( (ctype_alpha($c) || $c === '_') &&
            ($m = $this->scan('/[_a-zA-Z]\w*[!?]?/')) !== null) {
            $this->record($m, ctype_upper($m[0])? 'CONSTANT' : 'IDENT');
            if ($m === '__END__') {
                // outside an interpolation, everything after __END__ is
                // recorded as plain text
                if (!$this->interpolation) {
                    $this->record($this->rest(), null);
                    $this->terminate();
                }
                break;
            }
        }
        elseif ($this->scan($this->operator_regex)) {
            $this->record($this->match(), 'OPERATOR');
        }
        elseif ($this->scan("/[ \t]+/")) {
            $this->record($this->match(), null);
        }
        else {
            // anything else: consume a single character untyped
            $this->record($this->get(), null);
        }
    }
    // In case not everything was popped
    if (isset($this->state_[0])) {
        $this->record(
            substr($this->string(), $this->state_[0][3],
                $this->pos() - $this->state_[0][3]),
            $this->state_[0][0]
        );
        $this->terminate();
    }
}
/**
 * Estimates how likely it is that $src is Ruby source code.
 *
 * @param string $src  the source code to examine
 * @param array  $info may hold 'shebang' (the shebang line, if any) and
 *                     'num_lines' (number of lines in $src); a missing
 *                     shebang is treated as empty and a missing line count
 *                     is derived from $src
 * @return int|float 1.0 for a ruby shebang, 0 for any other shebang,
 *                   otherwise the sum of the weights of the heuristics
 *                   below which matched
 */
public static function guess_language($src, $info) {
    $shebang = isset($info['shebang']) ? (string)$info['shebang'] : '';
    if (strpos($shebang, 'ruby') !== false) return 1.0;
    elseif ($shebang) return 0;
    $p = 0;
    // strpos() returns 0 for a match at the very start of the source, so
    // the results are compared against false rather than used as booleans
    if (strpos($src, 'nil') !== false) $p += 0.05;
    if (strpos($src, '.nil?') !== false) $p += 0.02;
    if (strpos($src, '.empty?') !== false) $p += 0.02;
    // interpolation
    if (strpos($src, '#{$') !== false) $p += 0.02;
    // @ and $ vars
    if (preg_match('/@[a-zA-Z_]/', $src) && preg_match('/\\$[a-zA-Z_]/', $src))
        $p += 0.02;
    // symbols
    if (preg_match('/:[a-zA-Z_]/', $src)) $p += 0.01;
    // func def - no args
    if (preg_match("/^\s*+def\s++[a-zA-Z_]\w*+[ \t]*+[\n\r]/m", $src))
        $p += 0.1;
    // {|x[,y[,z...]]| is a very ruby-like construct
    if (preg_match('/ \\{ \\|
\s*+ [a-zA-Z_]\w*+ \s*+
(,\s*+[a-zA-Z_]\w*+\s*+)*+
\\|/x', $src))
        $p += 0.15;
    // so is 'do |x|'
    if (preg_match("/\\bdo\s*+\\|[^\\|\r\n]++\\|/", $src))
        $p += 0.05;
    // class defs with inheritance has quite distinct syntax
    // class x < y
    if (preg_match(
        "/^ \s* class \s+ \w+ \s* < \s* \w+(::\w+)* [\t ]*+ [\r\n] /mx",
        $src))
        $p += 0.1;
    $num_lines = isset($info['num_lines'])
        ? $info['num_lines']
        : substr_count($src, "\n") + 1;
    // let's say if 5% of lines are hash commented that's a good thing
    if (substr_count($src, '#') > $num_lines/20) $p += 0.05;
    // =~ /regex/
    if (preg_match('%=~\s++/%', $src)) $p += 0.02;
    if (preg_match('/unless\s+[^\?]++\?/', $src)) $p += 0.05;
    // a def and an end at the same indentation level
    if (preg_match('/^(\s*+)def\s+.*^\1end\s/ms', $src)) $p += 0.05;
    if (preg_match('/\.to_\w+(?=\s|$)/', $src)) $p += 0.01;
    return $p;
}
}