1
0
Fork 0

Rewrite php file cleaning step to be less regex intensive and support extreme cases better, fixes #10106 (#10107)

pull/10150/head
Jordi Boggiano 2021-10-02 14:39:39 +02:00 committed by GitHub
parent d99b200cf3
commit d64d1adf61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 257 additions and 52 deletions

View File

@ -214,10 +214,7 @@ class ClassMapGenerator
*/ */
private static function findClasses($path) private static function findClasses($path)
{ {
$extraTypes = PHP_VERSION_ID < 50400 ? '' : '|trait'; $extraTypes = self::getExtraTypes();
if (PHP_VERSION_ID >= 80100 || (defined('HHVM_VERSION') && version_compare(HHVM_VERSION, '3.3', '>='))) {
$extraTypes .= '|enum';
}
// Use @ here instead of Silencer to actively suppress 'unhelpful' output // Use @ here instead of Silencer to actively suppress 'unhelpful' output
// @link https://github.com/composer/composer/pull/4886 // @link https://github.com/composer/composer/pull/4886
@ -241,57 +238,14 @@ class ClassMapGenerator
} }
// return early if there is no chance of matching anything in this file // return early if there is no chance of matching anything in this file
if (!preg_match('{\b(?:class|interface'.$extraTypes.')\s}i', $contents)) { preg_match_all('{\b(?:class|interface'.$extraTypes.')\s}i', $contents, $matches);
if (!$matches) {
return array(); return array();
} }
// strip heredocs/nowdocs $p = new PhpFileCleaner($contents, count($matches[0]));
$heredocRegex = '{ $contents = $p->clean();
# opening heredoc/nowdoc delimiter (word-chars) unset($p);
<<<[ \t]*+([\'"]?)(\w++)\\1
# needs to be followed by a newline
(?:\r\n|\n|\r)
# the meat of it, matching line by line until end delimiter
(?:
# a valid line is optional white-space (possessive match) not followed by the end delimiter, then anything goes for the rest of the line
[\t ]*+(?!\\2 \b)[^\r\n]*+
# end of line(s)
[\r\n]++
)*
# end delimiter
[\t ]*+ \\2 (?=\b)
}x';
// run first assuming the file is valid unicode
$contentWithoutHeredoc = preg_replace($heredocRegex.'u', 'null', $contents);
if (null === $contentWithoutHeredoc) {
// run again without unicode support if the file failed to be parsed
$contents = preg_replace($heredocRegex, 'null', $contents);
} else {
$contents = $contentWithoutHeredoc;
}
unset($contentWithoutHeredoc);
// strip strings
$contents = preg_replace('{"[^"\\\\]*+(\\\\.[^"\\\\]*+)*+"|\'[^\'\\\\]*+(\\\\.[^\'\\\\]*+)*+\'}s', 'null', $contents);
// strip leading non-php code if needed
if (strpos($contents, '<?') !== 0) {
$contents = preg_replace('{^.+?<\?}s', '<?', $contents, 1, $replacements);
if ($replacements === 0) {
return array();
}
}
// strip non-php blocks in the file
$contents = preg_replace('{\?>(?:[^<]++|<(?!\?))*+<\?}s', '?><?', $contents);
// strip trailing non-php code if needed
$pos = strrpos($contents, '?>');
if (false !== $pos && false === strpos(substr($contents, $pos), '<?')) {
$contents = substr($contents, 0, $pos);
}
// strip comments if short open tags are in the file
if (preg_match('{(<\?)(?!(php|hh))}i', $contents)) {
$contents = preg_replace('{//.* | /\*(?:[^*]++|\*(?!/))*\*/}x', '', $contents);
}
preg_match_all('{ preg_match_all('{
(?: (?:
@ -328,4 +282,18 @@ class ClassMapGenerator
return $classes; return $classes;
} }
/**
 * Returns the regex alternation suffix for the type keywords beyond
 * "class" and "interface" that should be scanned for on this runtime.
 *
 * The value is computed once and memoized in a function-static variable.
 * On that first call the full keyword list is also handed to
 * PhpFileCleaner::setTypeConfig().
 *
 * @return string Either '' or a string of '|'-prefixed keywords such as '|trait|enum'
 */
private static function getExtraTypes()
{
    static $extraTypes = null;

    // Already computed by an earlier call: nothing left to do
    if (null !== $extraTypes) {
        return $extraTypes;
    }

    $withTrait = !(PHP_VERSION_ID < 50400);
    $withEnum = PHP_VERSION_ID >= 80100
        || (defined('HHVM_VERSION') && version_compare(HHVM_VERSION, '3.3', '>='));

    $extraTypes = $withTrait ? '|trait' : '';
    if ($withEnum) {
        $extraTypes .= '|enum';
    }

    // array_filter drops the empty leading segment produced by explode() on the '|' prefix
    $additionalKeywords = array_filter(explode('|', $extraTypes));
    PhpFileCleaner::setTypeConfig(array_merge(array('class', 'interface'), $additionalKeywords));

    return $extraTypes;
}
} }

View File

@ -0,0 +1,228 @@
<?php
namespace Composer\Autoload;
/**
 * Single-pass scanner that reduces a PHP source string to the parts relevant
 * for locating type declarations.
 *
 * Content outside of <? ... ?> sections is dropped, quoted strings and
 * heredocs/nowdocs are replaced by the literal "null", and // and block
 * comments are removed. setTypeConfig() should be called before clean() so
 * the type keywords are known.
 *
 * @author Jordi Boggiano <j.boggiano@seld.be>
 * @internal
 */
class PhpFileCleaner
{
    /**
     * Keyword configuration indexed by the first byte of each keyword.
     *
     * @var array<string, array{name: string, length: int, pattern: string}>
     */
    private static $typeConfig = array();

    /**
     * Anchored pattern consuming a run of bytes that cannot start any construct
     * clean() treats specially. Defaults to the pattern built for an empty keyword list.
     *
     * @var string
     */
    private static $restPattern = '{[^?"\'</]+}A';

    /**
     * @readonly
     * @var string
     */
    private $contents;

    /**
     * @readonly
     * @var int
     */
    private $len;

    /**
     * @readonly
     * @var int
     */
    private $maxMatches;

    /** @var int Current byte offset into $contents */
    private $index = 0;

    /**
     * Registers the type keywords (e.g. "class", "interface") to look out for.
     *
     * Entries are keyed by the first byte of the keyword, so a keyword replaces
     * any previously registered keyword starting with the same byte.
     *
     * @param string[] $types
     * @return void
     */
    public static function setTypeConfig($types)
    {
        foreach ($types as $type) {
            self::$typeConfig[$type[0]] = array(
                'name' => $type,
                'length' => \strlen($type),
                // matched starting one byte BEFORE the keyword (see clean()), hence the leading "."
                'pattern' => '{.\b(?<![\$:>])'.$type.'\s++[a-zA-Z_\x7f-\xff:][a-zA-Z0-9_\x7f-\xff:\-]*+}Ais',
            );
        }

        // every first byte of a keyword must stop the "rest" run so clean() gets to inspect it
        self::$restPattern = '{[^?"\'</'.implode('', array_keys(self::$typeConfig)).']+}A';
    }

    /**
     * @param string $contents   Source to clean
     * @param int    $maxMatches When exactly 1, clean() returns as soon as the first
     *                           type declaration has been found
     */
    public function __construct($contents, $maxMatches)
    {
        $this->contents = $contents;
        $this->len = \strlen($this->contents);
        $this->maxMatches = $maxMatches;
    }

    /**
     * Produces the cleaned-up source.
     *
     * @return string
     */
    public function clean()
    {
        $clean = '';

        while ($this->index < $this->len) {
            $this->skipToPhp();
            $clean .= '<?';

            while ($this->index < $this->len) {
                $char = $this->contents[$this->index];

                // closing tag: leave PHP mode and go look for the next opening tag
                if ($char === '?' && $this->peek('>')) {
                    $clean .= '?>';
                    $this->index += 2;
                    continue 2;
                }

                if ($char === '"') {
                    $this->skipString('"');
                    $clean .= 'null';
                    continue;
                }

                if ($char === "'") {
                    $this->skipString("'");
                    $clean .= 'null';
                    continue;
                }

                if ($char === "<" && $this->peek('<') && $this->match('{<<<[ \t]*+([\'"]?)([a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*+)\\1(?:\r\n|\n|\r)}A', $match)) {
                    $this->index += \strlen($match[0]);
                    $this->skipHeredoc($match[2]);
                    $clean .= 'null';
                    continue;
                }

                if ($char === '/') {
                    if ($this->peek('/')) {
                        $this->skipToNewline();
                        continue;
                    }
                    if ($this->peek('*')) {
                        $this->skipComment();
                        // restart with the byte following the comment; falling through would
                        // emit a stray "/" and swallow the first byte after the comment
                        continue;
                    }
                }

                // single expected match: stop scanning as soon as the declaration is found
                if ($this->maxMatches === 1 && isset(self::$typeConfig[$char])) {
                    $type = self::$typeConfig[$char];
                    if (
                        \substr($this->contents, $this->index, $type['length']) === $type['name']
                        && \preg_match($type['pattern'], $this->contents, $match, 0, $this->index - 1)
                    ) {
                        return $clean . $match[0];
                    }
                }

                // ordinary byte: copy it together with the following run of uninteresting bytes
                $this->index += 1;
                if ($this->match(self::$restPattern, $match)) {
                    $clean .= $char . $match[0];
                    $this->index += \strlen($match[0]);
                } else {
                    $clean .= $char;
                }
            }
        }

        return $clean;
    }

    /**
     * Advances the index just past the next "<?", or to the end of the contents if there is none.
     *
     * @return void
     */
    private function skipToPhp()
    {
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === '<' && $this->peek('?')) {
                $this->index += 2;
                break;
            }

            $this->index += 1;
        }
    }

    /**
     * Advances the index past a quoted string whose opening quote is at the current index.
     *
     * @param string $delimiter The quote character, either ' or "
     * @return void
     */
    private function skipString($delimiter)
    {
        $this->index += 1;
        while ($this->index < $this->len) {
            // an escaped backslash or escaped delimiter never terminates the string
            if ($this->contents[$this->index] === '\\' && ($this->peek('\\') || $this->peek($delimiter))) {
                $this->index += 2;
                continue;
            }
            if ($this->contents[$this->index] === $delimiter) {
                $this->index += 1;
                break;
            }
            $this->index += 1;
        }
    }

    /**
     * Advances the index past a block comment whose opening "/*" is at the current index.
     *
     * @return void
     */
    private function skipComment()
    {
        $this->index += 2;
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === '*' && $this->peek('/')) {
                $this->index += 2;
                break;
            }

            $this->index += 1;
        }
    }

    /**
     * Advances the index to the next \r or \n (not past it), or to the end of the contents.
     *
     * @return void
     */
    private function skipToNewline()
    {
        while ($this->index < $this->len) {
            if ($this->contents[$this->index] === "\r" || $this->contents[$this->index] === "\n") {
                return;
            }
            $this->index += 1;
        }
    }

    /**
     * Advances the index past a heredoc/nowdoc body including its closing delimiter.
     *
     * The index is expected to be at the start of the first body line.
     *
     * @param string $delimiter The heredoc identifier without quotes
     * @return void
     */
    private function skipHeredoc($delimiter)
    {
        $firstDelimiterChar = $delimiter[0];
        $delimiterLength = \strlen($delimiter);
        // the identifier only terminates the heredoc when not followed by another identifier byte
        $delimiterPattern = '{'.preg_quote($delimiter).'(?![a-zA-Z0-9_\x80-\xff])}A';

        while ($this->index < $this->len) {
            // check if we find the delimiter after some spaces/tabs
            switch ($this->contents[$this->index]) {
                case "\t":
                case " ":
                    $this->index += 1;
                    continue 2;
                case $firstDelimiterChar:
                    if (
                        \substr($this->contents, $this->index, $delimiterLength) === $delimiter
                        && $this->match($delimiterPattern)
                    ) {
                        $this->index += $delimiterLength;

                        return;
                    }
                    break;
            }

            // not a closing delimiter: skip the rest of the line
            $this->skipToNewline();

            // skip the newline(s) so the next iteration starts at the beginning of a line
            while ($this->index < $this->len && ($this->contents[$this->index] === "\r" || $this->contents[$this->index] === "\n")) {
                $this->index += 1;
            }
        }
    }

    /**
     * Tells whether the byte following the current index equals $char.
     *
     * @param string $char
     * @return bool
     */
    private function peek($char)
    {
        return $this->index + 1 < $this->len && $this->contents[$this->index + 1] === $char;
    }

    /**
     * Runs $regex against the contents starting at the current index.
     *
     * The $match parameter deliberately carries no "array" type declaration:
     * combined with the null default it would be an implicitly nullable type,
     * which newer PHP versions deprecate, while the explicit "?array" form is
     * not available on the old PHP versions this code still supports.
     *
     * @param string     $regex
     * @param array|null $match Receives the preg_match() groups
     * @return bool
     */
    private function match($regex, &$match = null)
    {
        if (\preg_match($regex, $this->contents, $match, 0, $this->index)) {
            return true;
        }

        return false;
    }
}

View File

@ -0,0 +1,7 @@
<?php
echo <<<'NOT¶ING_TO_SEE_H¤RE'
class FailHeredocNonUnicodeNonAscii
{
}
NOT¶ING_TO_SEE_H¤RE;

View File

@ -19,6 +19,8 @@ class FailHeredocWhitespace
} }
WHITESPACE . <<< MARKERINTEXT WHITESPACE . <<< MARKERINTEXT
In PHP < 7.3, the docblock marker could occur in the text as long as it did not occur at the very start of the line. In PHP < 7.3, the docblock marker could occur in the text as long as it did not occur at the very start of the line.
MARKERINTEXTwithtrail
MARKERINTEXT_
class FailHeredocMarkerInText class FailHeredocMarkerInText
{ {
} }