comunic/3rdparty/luminous/languages/diff.php

<?php


/*
 * Diff is a strange one because we could just highlight the lines and be done 
 * with it, but we are actually going to try to highlight the source code AND 
 * the diff format
 * 
 * As such, we handle formatting and tagging inside the scanner.
 */
class LuminousDiffScanner extends LuminousScanner {

  public $patterns = array();
  public $pretty_mode = false; // pretty mode uses language sub-scanners
    // to try to highlight the embedded code
  
  /* TODO: plug this into the language code selector in the old EasyAPI
   * when we port it across 
   * This function is just a placeholder and will be implemented properly
   * later.
   */
  function get_child_scanner($filename) {
    // HACK - pretty mode should be reflected elsewhere than here.
    if (!$this->pretty_mode) return null;
    // $luminous_ is a singleton from the main calling API. It may or may not
    // exist here, but if it does, we're going to use it.
    global $luminous_;
    if (!isset($luminous_))
      return null;

    $spos = strrpos($filename, '.');
    if ($spos === false) {return null;}
    $ext = substr($filename, $spos+1);
    $s = $luminous_->scanners->GetScanner(strtolower($ext));
    // we actually only want the classname, not an instance.
    if ($s === null) return null;
    else return get_class($s);
  }
  

  function string($string=null) {
    if ($string !== null) {
      if (preg_match('/^[><]/m', $string)) {
        // normal rules
        $this->patterns['range'] = '/\d+.*/';
        $this->patterns['codeblock'] = "/(^([<> ]).*(\n)?)+/m";
      }
      elseif (preg_match('/^\*{3}/m', $string)) {
        // context
        $this->patterns['range'] = "/([\-\*]{3})[ \t]+\d+,\d+[ \t]+\\1.*/";
        $this->patterns['codeblock'] = "/(^([!+ ]).*(\n)?)+/m";
      }
      else {
        // unified
        $this->patterns['range'] = "/@@.*/";
        $this->patterns['codeblock'] = "/(^([+\- ]).*(\n)?)+/m";
      }
    }

    return parent::string($string);

  }
  
  function main() {
    // we're aiming to handle context, unified and normal diff all at once here
    // because it doesn't really seem that hard.
    $child = null;
    $last_index = -1;
    while (!$this->eos()) {
      $index = $this->pos();
      assert($index > $last_index);
      $last_index = $index;
      
      assert($this->bol());
      
      $tok = null;
      if ($this->scan('/diff\s.*$/m') !== null)  $tok = 'KEYWORD';
      // normal, context and unified ranges
      elseif($this->scan($this->patterns['range']) !== null)
        $tok = 'DIFF_RANGE';
      elseif($this->scan("/-{3}[ \t]*$/m")) $tok = null;
      
      elseif($this->scan('/(?:\**|=*|\w.*)$/m') !== null) $tok = 'KEYWORD';
      // this is a header line which may contain a file path. If it does,
      // update the child scanner according to its extension.
      elseif($this->scan("@[+\-\*]{3}(\s+([^\s]*)([ \t]|$))?.*@m") !== null) {
        $m = $this->match_groups();
        // unified uses +++, context uses *
        if ($m[0][0] === '+' || $m[0][0] === '*')
          $tok = 'DIFF_HEADER_NEW';
        else $tok = 'DIFF_HEADER_OLD';
        if (isset($m[2])) {
          $filename = preg_replace('@.*\\\\/@', '', $m[2]);
          $child = self::get_child_scanner($filename);  
        }
      }
      elseif($this->scan('/\\\\.*/') !== null) $tok = null;
      elseif($this->scan($this->patterns['codeblock']) !== null) {
          
        // this is actual source code.
        // we're going to format this here.
        // we're going to extract the block, and try to re-assemble it as
        // verbatim code, then highlight it via a child scanner, then split up
        // the lines, re-apply the necessary prefixes (e.g. + or -) to them,
        // and store them as being a DIFF_ token.
        // we have to do it like this, rather than line by line, otherwise
        // multiline tokens aren't going to work properly. There's stilla  risk
        // that the diff will be fragmented such the child scanner gets it
        // wrong but that can't be helped.

        // TODO restructure this so the complicated bits aren't done if there's
        // no child scanner to pass it down to

        $block = $this->match();
        if (!strlen($block)) {
          assert(0);
        }

        $lines = explode("\n", $block);
        $verbatim = array();
        $verbatim_ = '';
        $types = array();
        $prefixes = array();
        foreach($lines as $l) {
          if (!strlen($l) || $l[0] === ' ')
            $types[]= 'DIFF_UNCHANGED';
          elseif ($l[0] === '+' || $l[0] === '>')
            $types[] = 'DIFF_NEW';
          elseif ($l[0] === '!' || $l[0] === '<' || $l[0] === '-')
            $types[] = 'DIFF_OLD';
          else assert(0);
          $prefixes[] = (isset($l[0]))? $l[0] : '';
          $verbatim_[] = substr($l, 1);
        }
        $verbatim = implode("\n", $verbatim_);
        $escaped = false;
        $tagged;
        if ($child !== null) {
          $c = new $child;
          $c->init();
          $c->string($verbatim);
          $c->main();
          $tagged = $c->tagged();
          $escaped = true;
        } else {
          $tagged = $verbatim;
        }
        $exp = explode("\n", $tagged);
        assert(count($exp) === count($prefixes));
        foreach($exp as $i=>$v) {
          $t = $types[$i];
          // if the sub-scanner escaped the line, we also need to escape the
          // prefix for consistency
          $prefix = $prefixes[$i];
          if ($escaped) $prefix = LuminousUtils::escape_string($prefix);
          $text = $prefix . $v;
          $this->record(
            $text,
            $t,
            $escaped);
          if ($i < count($exp)-1) $this->record("\n", null);
        }
        if ($this->eol()) $this->record($this->get(), null);
        continue;
        }
      else $this->scan('/.*/');

      // previous else clause can capture empty strings
      if ($this->match() !== '') 
        $this->record($this->match(), $tok);
      
      assert($this->eol());
      // consume newline
      if (!$this->eos()) $this->record($this->get(), null);
      
    }
  }

  static function guess_language($src, $info) {
    // diff isn't too hard. We check for 'index' and a few other things
    $p = 0.0;
    if (preg_match("/^-{3}.*+[\n\r]++\\+{3}/m", $src)) $p = 0.25;
    if (preg_match('/^@@.*@@/m', $src)) $p += 0.25;
    if (preg_match('/^(index|diff)\\b/m', $src)) $p += 0.10;


    // finally we look for the diff markers at the line starts
    // we're going to use the remaining 40% of the probability as so:
    // We'll say a perfect match for diff has 
    // 10%+ of its lines starting with the +/- markers (</> or +/! for 
    // context/original format), and we'll scale real proportion 
    // to fill up the remaining 0.4
    $c = preg_match_all('/^[<>+\\-!]\s/m', $src, $m);
    $num_lines = $info['num_lines'];
    if ($num_lines > 0) {
      $proportion = $c/$num_lines;
      $proportion = min(0.1, $proportion);
      $p += 0.4 * ($proportion * 10);
    }
    return $p;
  }
 
}


class LuminousPrettyDiffScanner extends LuminousDiffScanner {
  public $pretty_mode = true;
}
First commit 2016-11-19 12:08:12 +01:00			`<?php`


			`/*`
			`* Diff is a strange one because we could just highlight the lines and be done`
			`* with it, but we are actually going to try to highlight the source code AND`
			`* the diff format`
			`*`
			`* As such, we handle formatting and tagging inside the scanner.`
			`*/`
			`class LuminousDiffScanner extends LuminousScanner {`

			`public $patterns = array();`
			`public $pretty_mode = false; // pretty mode uses language sub-scanners`
			`// to try to highlight the embedded code`

			`/* TODO: plug this into the language code selector in the old EasyAPI`
			`* when we port it across`
			`* This function is just a placeholder and will be implemented properly`
			`* later.`
			`*/`
			`function get_child_scanner($filename) {`
			`// HACK - pretty mode should be reflected elsewhere than here.`
			`if (!$this->pretty_mode) return null;`
			`// $luminous_ is a singleton from the main calling API. It may or may not`
			`// exist here, but if it does, we're going to use it.`
			`global $luminous_;`
			`if (!isset($luminous_))`
			`return null;`

			`$spos = strrpos($filename, '.');`
			`if ($spos === false) {return null;}`
			`$ext = substr($filename, $spos+1);`
			`$s = $luminous_->scanners->GetScanner(strtolower($ext));`
			`// we actually only want the classname, not an instance.`
			`if ($s === null) return null;`
			`else return get_class($s);`
			`}`


			`function string($string=null) {`
			`if ($string !== null) {`
			`if (preg_match('/^[><]/m', $string)) {`
			`// normal rules`
			`$this->patterns['range'] = '/\d+.*/';`
			`$this->patterns['codeblock'] = "/(^([<> ]).*(\n)?)+/m";`
			`}`
			`elseif (preg_match('/^\*{3}/m', $string)) {`
			`// context`
			`$this->patterns['range'] = "/([\-\]{3})[ \t]+\d+,\d+[ \t]+\\1./";`
			`$this->patterns['codeblock'] = "/(^([!+ ]).*(\n)?)+/m";`
			`}`
			`else {`
			`// unified`
			`$this->patterns['range'] = "/@@.*/";`
			`$this->patterns['codeblock'] = "/(^([+\- ]).*(\n)?)+/m";`
			`}`
			`}`

			`return parent::string($string);`

			`}`

			`function main() {`
			`// we're aiming to handle context, unified and normal diff all at once here`
			`// because it doesn't really seem that hard.`
			`$child = null;`
			`$last_index = -1;`
			`while (!$this->eos()) {`
			`$index = $this->pos();`
			`assert($index > $last_index);`
			`$last_index = $index;`

			`assert($this->bol());`

			`$tok = null;`
			`if ($this->scan('/diff\s.*$/m') !== null) $tok = 'KEYWORD';`
			`// normal, context and unified ranges`
			`elseif($this->scan($this->patterns['range']) !== null)`
			`$tok = 'DIFF_RANGE';`
			`elseif($this->scan("/-{3}[ \t]*$/m")) $tok = null;`

			`elseif($this->scan('/(?:\*\|=\|\w.*)$/m') !== null) $tok = 'KEYWORD';`
			`// this is a header line which may contain a file path. If it does,`
			`// update the child scanner according to its extension.`
			`elseif($this->scan("@[+\-\]{3}(\s+([^\s])([ \t]\|$))?.*@m") !== null) {`
			`$m = $this->match_groups();`
			`// unified uses +++, context uses *`
			`if ($m[0][0] === '+' \|\| $m[0][0] === '*')`
			`$tok = 'DIFF_HEADER_NEW';`
			`else $tok = 'DIFF_HEADER_OLD';`
			`if (isset($m[2])) {`
			`$filename = preg_replace('@.*\\\\/@', '', $m[2]);`
			`$child = self::get_child_scanner($filename);`
			`}`
			`}`
			`elseif($this->scan('/\\\\.*/') !== null) $tok = null;`
			`elseif($this->scan($this->patterns['codeblock']) !== null) {`

			`// this is actual source code.`
			`// we're going to format this here.`
			`// we're going to extract the block, and try to re-assemble it as`
			`// verbatim code, then highlight it via a child scanner, then split up`
			`// the lines, re-apply the necessary prefixes (e.g. + or -) to them,`
			`// and store them as being a DIFF_ token.`
			`// we have to do it like this, rather than line by line, otherwise`
			`// multiline tokens aren't going to work properly. There's stilla risk`
			`// that the diff will be fragmented such the child scanner gets it`
			`// wrong but that can't be helped.`

			`// TODO restructure this so the complicated bits aren't done if there's`
			`// no child scanner to pass it down to`

			`$block = $this->match();`
			`if (!strlen($block)) {`
			`assert(0);`
			`}`

			`$lines = explode("\n", $block);`
			`$verbatim = array();`
			`$verbatim_ = '';`
			`$types = array();`
			`$prefixes = array();`
			`foreach($lines as $l) {`
			`if (!strlen($l) \|\| $l[0] === ' ')`
			`$types[]= 'DIFF_UNCHANGED';`
			`elseif ($l[0] === '+' \|\| $l[0] === '>')`
			`$types[] = 'DIFF_NEW';`
			`elseif ($l[0] === '!' \|\| $l[0] === '<' \|\| $l[0] === '-')`
			`$types[] = 'DIFF_OLD';`
			`else assert(0);`
			`$prefixes[] = (isset($l[0]))? $l[0] : '';`
			`$verbatim_[] = substr($l, 1);`
			`}`
			`$verbatim = implode("\n", $verbatim_);`
			`$escaped = false;`
			`$tagged;`
			`if ($child !== null) {`
			`$c = new $child;`
			`$c->init();`
			`$c->string($verbatim);`
			`$c->main();`
			`$tagged = $c->tagged();`
			`$escaped = true;`
			`} else {`
			`$tagged = $verbatim;`
			`}`
			`$exp = explode("\n", $tagged);`
			`assert(count($exp) === count($prefixes));`
			`foreach($exp as $i=>$v) {`
			`$t = $types[$i];`
			`// if the sub-scanner escaped the line, we also need to escape the`
			`// prefix for consistency`
			`$prefix = $prefixes[$i];`
			`if ($escaped) $prefix = LuminousUtils::escape_string($prefix);`
			`$text = $prefix . $v;`
			`$this->record(`
			`$text,`
			`$t,`
			`$escaped);`
			`if ($i < count($exp)-1) $this->record("\n", null);`
			`}`
			`if ($this->eol()) $this->record($this->get(), null);`
			`continue;`
			`}`
			`else $this->scan('/.*/');`

			`// previous else clause can capture empty strings`
			`if ($this->match() !== '')`
			`$this->record($this->match(), $tok);`

			`assert($this->eol());`
			`// consume newline`
			`if (!$this->eos()) $this->record($this->get(), null);`

			`}`
			`}`

			`static function guess_language($src, $info) {`
			`// diff isn't too hard. We check for 'index' and a few other things`
			`$p = 0.0;`
			`if (preg_match("/^-{3}.*+[\n\r]++\\+{3}/m", $src)) $p = 0.25;`
			`if (preg_match('/^@@.*@@/m', $src)) $p += 0.25;`
			`if (preg_match('/^(index\|diff)\\b/m', $src)) $p += 0.10;`


			`// finally we look for the diff markers at the line starts`
			`// we're going to use the remaining 40% of the probability as so:`
			`// We'll say a perfect match for diff has`
			`// 10%+ of its lines starting with the +/- markers (</> or +/! for`
			`// context/original format), and we'll scale real proportion`
			`// to fill up the remaining 0.4`
			`$c = preg_match_all('/^[<>+\\-!]\s/m', $src, $m);`
			`$num_lines = $info['num_lines'];`
			`if ($num_lines > 0) {`
			`$proportion = $c/$num_lines;`
			`$proportion = min(0.1, $proportion);`
			`$p += 0.4 * ($proportion * 10);`
			`}`
			`return $p;`
			`}`

			`}`


			`class LuminousPrettyDiffScanner extends LuminousDiffScanner {`
			`public $pretty_mode = true;`
			`}`