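In PHP this can be done with preg_replace_callback() and a recursive regular expression (using (?R), (?1), (?2)..., which let a pattern call itself to match nested structures). The following tested script demonstrates the approach: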
<?php
/**
 * Rewrite the non-CODE parts of a text while leaving <code>...</code>
 * sections (including nested ones) untouched.
 *
 * The text is split by a recursive regex into CODE sections (capture group 1)
 * and non-CODE sections (capture group 2). Every section is handed to
 * _my_callback(), which decides what replaces it.
 *
 * @param string $text Text to process. Passed by reference: on success it is
 *                     overwritten with the processed text.
 * @return string The processed text (the same value stored back into $text).
 *
 * If preg_replace_callback() reports a PCRE failure by returning null, the
 * script is terminated via exit() with an error message and $text is left
 * unmodified.
 */
function clean_non_code(&$text) {
    $re = '%
        # Match and capture either CODE into $1 or non-CODE into $2.
          (                      # $1: CODE section (never empty).
            <code\b[^>]*>        # CODE opening tag
            (?R)+                # CODE contents w/nested CODE tags.
            </code\s*>           # CODE closing tag
          )                      # End $1: CODE section.
        |                        # Or...
          (                      # $2: Non-CODE section (may be empty).
            [^<]*+               # Zero or more non-< {normal*}
            (?:                  # Begin {(special normal*)*}
              (?!</?code\b)      # If not a code open or close tag,
              <                  # match non-code < {special}
              [^<]*+             # More {normal*}
            )*+                  # End {(special normal*)*}
          )                      # End $2: Non-CODE section
        %ix';

    // Keep the result in a local so the caller's variable is not clobbered
    // with null when PCRE fails.
    $result = preg_replace_callback($re, '_my_callback', $text);
    if ($result === null) {
        // Double quotes are required here so that \n is emitted as a newline.
        exit("PREG Error!\nTarget string too big.");
    }

    $text = $result;
    return $text;
}
/**
 * Callback for preg_replace_callback(): decides what replaces each match.
 *
 * @param array $matches Match groups; index 1 holds a CODE section (empty
 *                       when the match is non-CODE), index 2 holds the
 *                       non-CODE text.
 * @return string The CODE section verbatim, or the non-CODE text with its
 *                whitespace tidied.
 */
function _my_callback($matches)
{
    $codeSection = $matches[1];
    if ($codeSection) {
        // CODE sections pass through untouched.
        return $codeSection;
    }

    // Rules are applied strictly in this order, each on the previous result.
    $rules = array(
        array('/[ \t][ \t]++/S', ' '), // run of 2+ spaces/tabs -> one space
        array('/^ /m', ''),            // drop a space at the start of a line
        array('/ $/m', ''),            // drop a space at the end of a line
    );

    $cleaned = $matches[2];
    foreach ($rules as $rule) {
        $cleaned = preg_replace($rule[0], $rule[1], $cleaned);
    }

    return $cleaned;
}
// Sample input: plain text followed by a CODE section containing nested CODE tags.
$data = "This is some text
This is also some text
<code>
User::setup(array(
'key1' => 'value1',
'key2' => 'value1',
'key42' => '<code>
Pay no attention to this. It has been proven over and
over again that it is <code> unpossible </code>
to parse nested stuff with regex! </code>'
));
</code>";

// Demo: print the sample before and after cleaning.
echo "BEFORE:\n", $data, "\n\n";
// clean_non_code() takes its argument by reference, so $data itself holds
// the cleaned text from here on.
$cleaned = clean_non_code($data);
echo "AFTER:\n", $cleaned, "\n\nTesting...";

// Benchmark: clean a large string made of 30000 copies of $data.
$bigdata = str_repeat($data, 30000);
$size = strlen($bigdata);

$startedAt = microtime(true);
$bigdata = clean_non_code($bigdata);
$time = microtime(true) - $startedAt;

$kbPerSecond = ($size / $time) / 1024.;
printf(
    "Done.\nTest string size: %d bytes. Time: %.3f sec. Speed: %.0f KB/s.\n",
    $size,
    $time,
    $kbPerSecond
);
?>
Output of the script's timing test on WinXP32, PHP 5.2.14 (cli):
'Test string size: 10410000 bytes. Time: 1.219 sec. Speed: 8337 KB/s.'
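Note that this solution does not handle CODE tags whose attributes contain angle brackets (<>) (probably a very rare edge case), but the regex could be modified to handle those as well. Note also that the maximum string length that can be processed depends on the content of the string (e.g. large, deeply nested CODE blocks may reduce the maximum input length.)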
p.s. SO. <!-- language: lang-none --> .