Version: 0.90b
Type: Function
Category: Algorithms
License: GNU General Public License
Description: This code parses any formatted text that is tagged with < and >. Look at the code for some explanation. It is being heavily updated at the moment (I need it as an OFX parser; more on that later), so it might be worth checking out. Arguments probably won't work, because I have not yet tested them (I do not need them when parsing OFX).
<?
// This code has been published under GPL version 2 or higher
// so you are free to modify etc etc
// Please drop me a line when you use it in your code or
// when you modify it
// I will be making changes to it and include more comments
// bolke@xs4all.nl
// btw NO WARRANTIES
// function parsehtml
// based on the libhtmlparse library by
// Mooneer Salem (mooneer@translator.cx)
// should be rewritten to be an extension to php4
// just a debug print function
// cause tags aren't that easy to output
// Version 0.90b
/**
 * Debug helper: prints a label followed by an HTML-entity-encoded value so
 * that markup contained in the value is shown literally instead of rendered.
 *
 * Only $var is encoded; $txt is written to the output exactly as given.
 */
function dp($txt, $var)
{
    // Map every character that has a named HTML entity onto that entity.
    $entityMap = get_html_translation_table(HTML_ENTITIES);

    echo sprintf("<br>%s is now: %s", $txt, strtr($var, $entityMap));
}
/**
 * Event-style scanner for text marked up with '<' and '>' (HTML, OFX/SGML).
 *
 * Walks $html byte by byte and reports what it finds by calling these
 * functions, none of which is defined in the visible source (the including
 * script has to provide them):
 *
 *   startCallback($tag, "", 0)      opening tag
 *   declCallback($tag, "", 0)       declaration, and the fallback described below
 *   endCallback($tag)               closing tag
 *   textStartCallback(), textCallback($text), textEndCallback()
 *   startCommentCallback(), commentCallback($comment), endCommentCallback()
 *
 * The second and third arguments of startCallback/declCallback are always
 * "" and 0: attribute values are never collected.
 *
 * Changes compared with the previous revision (control flow is otherwise kept):
 *   - every read goes through parseCharAt(), so positions outside the input
 *     read as '' instead of raising string-offset warnings;
 *   - scans for '>' / ' ' / '=' are bounded by the input length, so truncated
 *     input such as "<a" can no longer loop forever;
 *   - the newline test uses "\n" (the single-quoted '\n' never matched);
 *   - the end of a comment is recognised at a plain "-->";
 *   - a '0' character is no longer mistaken for the end of the input;
 *   - $istag and $comment are initialised before they are read;
 *   - a '!' at the start of a text run is ordinary text (the old look-back
 *     either repeated the same iteration forever or prepended the previous
 *     byte to the text).
 *
 * NOTE(review): still unfinished, as in the original:
 *   - a tag name followed by a space (i.e. a tag with attributes) or cut off
 *     by the end of the input is reported through declCallback() with the
 *     whole remaining input, and parsing stops;
 *   - "<!" is only examined as a comment when the '<' was consumed by the
 *     preceding text run; directly at a '<' the '!' is merely skipped.
 *
 * @param string $html text to scan
 * @return void
 */
function parse($html) {
    $tmp = (string) $html;
    $len = strlen($tmp);
    $c = 0;
    // Set by the text scanner when it has already stepped over a '<'.
    $istag = 0;

    while ($c < $len) {
        if (parseCharAt($tmp, $c) === '<' || $istag) {
            $istag = 0;
            // Arriving at a '<' this test is false and only steps over the
            // '<'; arriving via $istag it inspects the byte after the '<'.
            if (parseCharAt($tmp, $c++) === '!') {
                if (parseCharAt($tmp, $c++) === '-' || parseCharAt($tmp, $c + 1) === '-') {
                    // comment
                    $c += 2;
                    while (parseCharAt($tmp, $c) === ' ') {
                        $c++;
                    }
                    $c--;
                    startCommentCallback();
                    // Reset per comment so an unterminated one reports ''
                    // rather than an undefined or stale value.
                    $comment = '';
                    // Stop on the first '-' of the closing "-->".
                    $t = $c;
                    while ($t < $len
                        && !(parseCharAt($tmp, $t) === '-'
                            && parseCharAt($tmp, $t + 1) === '-'
                            && parseCharAt($tmp, $t + 2) === '>')) {
                        $t++;
                    }
                    if ($t < $len) {
                        $comment = substr($tmp, $c, $t - $c);
                        // Land on the '>'; the increment below steps past it.
                        $c = $t + 2;
                    }
                    commentCallback($comment);
                    endCommentCallback();
                    $c++;
                } else {
                    // NOTE(review): "<!x" reached this way is not reported as
                    // a declaration; scanning resumes as text after the '!'.
                    $c--;
                }
            } elseif (parseCharAt($tmp, $c) === '/' || parseCharAt($tmp, $c - 1) === '/') {
                // closing tag: everything up to the next '>' is the name
                if (parseCharAt($tmp, $c) === '/') {
                    $c++;
                }
                $t = $c;
                while ($t < $len && parseCharAt($tmp, $t) !== '>') {
                    $t++;
                }
                $tag = substr($tmp, $c, $t - $c);
                endCallback($tag);
                $c = $t + 1;
                continue;
            } else {
                // opening tag
                // When entered via $istag the cursor is one byte too far.
                if (parseCharAt($tmp, $c - 1) !== '<') {
                    $c--;
                }
                // The original skipped a leading '!' here as a side effect of
                // a test that could never succeed (and that named an
                // undefined comment() function); only the skip is kept.
                if (parseCharAt($tmp, $c) === '!') {
                    $c++;
                }
                $t = $c;
                $q = $c;
                $tag = substr($tmp, $c);
                while ($t < $len && parseCharAt($tmp, $t) !== '>' && parseCharAt($tmp, $t) !== ' ') {
                    $t++;
                }
                if (parseCharAt($tmp, $t) === '>') {
                    $tag = substr($tmp, $c, $t - $c);
                    if (parseCharAt($tag, 0) === '!') {
                        $tag = substr($tmp, $c + 1, $t - ($c + 1));
                        declCallback($tag, "", 0);
                    } else {
                        startCallback($tag, "", 0);
                    }
                    $c = $t + 1;
                    continue;
                } elseif (parseCharAt($tmp, $c) === ' ') {
                    while (parseCharAt($tmp, $c) === ' ') {
                        $c++;
                    }
                } else {
                    // NOTE(review): reached for a tag name followed by a space
                    // and for a tag cut off by the end of the input. $tag is
                    // still the whole remaining input here, and the break
                    // ends the parse.
                    if (parseCharAt($tmp, $q) === '!') {
                        $q++;
                        $tag = substr($tmp, $q, $q - $c);
                        declCallback($tag, "", 0);
                    } else {
                        declCallback($tag, "", 0);
                    }
                    break;
                }

                // NOTE(review): unfinished attribute scanner. It only moves
                // the cursor; names and values are not stored or passed on.
                // Apart from the loop bounds it is left as it was written.
                while ($c < $len) {
                    $istrue = 0;
                    $tagended = 0;
                    while (parseCharAt($tmp, $c) === ' ') {
                        $c++;
                    }
                    // NOTE(review): '!' binds tighter than '==', so this steps
                    // back only when the previous byte is "0" (or missing);
                    // probably "!= ' '" was meant. Left unchanged.
                    if (!parseCharAt($tmp, $c - 1) == ' ') {
                        $c--;
                    }
                    $first = parseCharAt($tmp, $c);
                    if ($first === '"' || $first === '\'') {
                        $c++;
                        // run to the next double quote not preceded by a backslash
                        while ($c < $len
                            && !(parseCharAt($tmp, $c) === '"' && parseCharAt($tmp, $c - 1) !== '\\')) {
                            $c++;
                        }
                        // The scan ends on '"' or at the end of the input,
                        // never on '>', so the original's "tag ended" exit
                        // after it was unreachable.
                        continue;
                    }
                    // attribute name: up to '=', ' ' or '>'
                    while ($c < $len
                        && parseCharAt($tmp, $c) !== '='
                        && parseCharAt($tmp, $c) !== ' '
                        && parseCharAt($tmp, $c) !== '>') {
                        $c++;
                    }
                    if (parseCharAt($tmp, $c) !== ' ' && parseCharAt($tmp, $c) !== '>') {
                        $istrue = 1;
                    }
                    if (parseCharAt($tmp, $c) === '>') {
                        $tagended = 1;
                    }
                    $c++;
                    if ($istrue) {
                        if (parseCharAt($tmp, $c) !== '\'' && parseCharAt($tmp, $c) !== '"') {
                            // unquoted value: up to ' ' or '>'
                            while ($c < $len
                                && parseCharAt($tmp, $c) !== ' '
                                && parseCharAt($tmp, $c) !== '>') {
                                $c++;
                            }
                            if (parseCharAt($tmp, $c) !== '>') {
                                $c++;
                                continue;
                            }
                            // on '>': the next pass sees it and ends the tag
                        } else {
                            // quoted value: up to the next unescaped quote
                            $c++;
                            while ($c < $len) {
                                $ch = parseCharAt($tmp, $c);
                                $isQuote = ($ch === '\'' || $ch === '"');
                                if ($isQuote && parseCharAt($tmp, $c - 1) !== '\\') {
                                    break;
                                }
                                $c++;
                            }
                            if (parseCharAt($tmp, $c) === '>') {
                                $c++;
                                break;
                            } elseif (parseCharAt($tmp, $c + 1) === '>') {
                                $c++;
                                break;
                            } else {
                                $c += 2;
                            }
                        }
                    } else {
                        if (!$tagended) {
                            continue;
                        }
                        // step back onto the '>'; it is skipped after the loop
                        $c--;
                        break;
                    }
                }

                // NOTE(review): $tag is still the remaining input from the tag
                // start, not just the tag name (marked FIXME in the original).
                if (parseCharAt($tag, 0) === '!') {
                    $tag = substr($tag, 1);
                    declCallback($tag, "", 0);
                } else {
                    startCallback($tag, "", 0);
                }
                $c++;
                continue;
            }
        } else {
            // A newline at the start of a text run is dropped.
            if (parseCharAt($tmp, $c) === "\n") {
                $c++;
                continue;
            }
            // $q marks where the reported text starts (leading spaces included).
            $q = $c;
            while (parseCharAt($tmp, $c) === ' ') {
                $c++;
            }
            if (parseCharAt($tmp, $c) === '<' && $c + 1 < $len) {
                // only spaces before the next tag: nothing to report
                continue;
            } elseif (!($c < $len)) {
                break;
            }
            textStartCallback();
            for (;;) {
                while ($c < $len && parseCharAt($tmp, $c) !== '<') {
                    $c++;
                }
                if (parseCharAt($tmp, $c) === '<') {
                    if (parseCharAt($tmp, $c + 1) === ' ') {
                        // "< " is not a tag; keep it in the text
                        $c++;
                        continue;
                    }
                    $istag = 1;
                }
                break;
            }
            $text = substr($tmp, $q, $c - $q);
            textCallback($text);
            textEndCallback();
            // steps over the '<' (hence the $istag flag)
            $c++;
            continue;
        }
    }
    return;
}

/**
 * Returns the single byte of $str at position $pos, or '' when $pos lies
 * outside the string (negative positions included).
 */
function parseCharAt($str, $pos)
{
    return ($pos >= 0 && $pos < strlen($str)) ? $str[$pos] : '';
}
?>