PHPBuilder - HTML parser



RSS Twitter
Snippets Algorithms

HTML parser

by: Bolke de Bruin
|
August 20, 2000

Version: 0.90b

Type: Function

Category: Algorithms

License: GNU General Public License

Description: Actually, this code parses any formatted text that is tagged with < and >. Look at the code for some explanation. It is being heavily updated at the moment (I need it as an OFX parser; more on that later), so it might be worth checking back. Arguments probably won't work, because I have not tested them yet (I do not need them when parsing OFX).



<?
// This code has been published under GPL version 2 or higher
// so you are free to modify etc etc
// Please drop me a line when you use it in your code or
// when you modify it
// I will be making changes to it and include more comments
// bolke@xs4all.nl
// btw NO WARRANTIES

// function parsehtml
// based on the libhtmlparse library by
// Mooneer Salem (mooneer@translator.cx)
// should be rewritten to be an extension to php4

// just a debug print function
// cause tags aren't that easy to output

// Version 0.90b

// Debug helper: prints "<br>LABEL is now: VALUE" to the output stream.
//   $txt - label, emitted as-is (it is NOT escaped)
//   $var - value to show; every character that has an HTML entity is
//          replaced by that entity, so tags stay visible in a browser
function dp($txt, $var)
{
	// character => entity map for the complete HTML entity set
	$entityMap = get_html_translation_table(HTML_ENTITIES);

	echo sprintf("<br>%s is now: %s", $txt, strtr($var, $entityMap));
}

// Scans $html from left to right and reports what it finds through
// callback functions. The callbacks are NOT defined here; the caller
// must define all of them before calling parse():
//
//   textStartCallback(); textCallback($text); textEndCallback();
//       a run of text between tags. Newlines at the very start of a run
//       are skipped, a run made up only of spaces is dropped, anything
//       else is passed on unchanged (leading spaces included).
//   startCommentCallback(); commentCallback($text); endCommentCallback();
//       a "<!-- ... -->" comment; $text has surrounding spaces stripped.
//       An unterminated comment extends to the end of the input.
//   endCallback($tag)
//       "</tag>": everything between "</" and the next ">".
//   declCallback($tag, "", 0)
//       "<!tag ...>" (for example a DOCTYPE).
//   startCallback($tag, "", 0)
//       "<tag ...>".
//
// For start tags and declarations $tag is the name up to the first space
// or ">". Arguments (attributes) are skipped, not parsed: the second and
// third callback parameters are always "" and 0. A ">" inside a quoted
// argument value does not end the tag; a quote preceded by a backslash
// does not end the quoted value.
//
// A "<" that is followed by a space, or that is the last character of
// the input, is treated as ordinary text. An unterminated tag ("<b" at
// the end of the input) is reported with what is there, and parsing stops.
//
// Returns nothing.
function parse($html) {
	$html = (string) $html;
	$len = strlen($html);
	$c = 0;

	while ($c < $len) {
		// A tag starts at '<' only if another, non-space character follows.
		$opensTag = ($html[$c] == '<' && $c + 1 < $len && $html[$c + 1] != ' ');

		if (!$opensTag) {
			// ---- text ----
			// Double quotes are required: '\n' in single quotes would be
			// a backslash followed by the letter n.
			if ($html[$c] == "\n") {
				$c++;
				continue;
			}

			// The run ends at the next '<' that really opens a tag. The
			// character at $c itself never does, so $end always advances.
			$end = $c;
			while ($end < $len && !($html[$end] == '<' && $end + 1 < $len && $html[$end + 1] != ' ')) $end++;

			$text = substr($html, $c, $end - $c);
			$c = $end;

			// padding between tags is not reported
			if (trim($text, ' ') === '') continue;

			textStartCallback();
			textCallback($text);
			textEndCallback();
			continue;
		}

		// ---- markup ----
		$c++;	// step over '<'; $c < $len is guaranteed by $opensTag

		if (substr($html, $c, 3) === '!--') {
			// comment
			$c += 3;
			$close = strpos($html, '-->', $c);
			if ($close === false) {
				// unterminated: the rest of the input is the comment
				$comment = (string) substr($html, $c);
				$c = $len;
			} else {
				$comment = (string) substr($html, $c, $close - $c);
				$c = $close + 3;
			}

			startCommentCallback();
			commentCallback(trim($comment, ' '));
			endCommentCallback();
			continue;
		}

		if ($html[$c] == '/') {
			// end tag
			$c++;
			$close = strpos($html, '>', $c);
			if ($close === false) $close = $len;

			endCallback((string) substr($html, $c, $close - $c));
			$c = $close + 1;
			continue;
		}

		// start tag or declaration
		$isdecl = ($html[$c] == '!');
		if ($isdecl) $c++;

		// the name runs up to the first space or '>'
		$t = $c;
		while ($t < $len && $html[$t] != '>' && $html[$t] != ' ') $t++;
		$tag = (string) substr($html, $c, $t - $c);

		// Skip the arguments up to the closing '>'. $quote holds the quote
		// character of the value being skipped, or '' outside of a value.
		$quote = '';
		while ($t < $len) {
			$ch = $html[$t];
			if ($quote !== '') {
				// $t - 1 >= 0 here because at least '<' precedes $t
				if ($ch === $quote && $html[$t - 1] != '\\') $quote = '';
			} elseif ($ch == '"' || $ch == '\'') {
				$quote = $ch;
			} elseif ($ch == '>') {
				break;
			}
			$t++;
		}

		if ($isdecl) {
			declCallback($tag, "", 0);
		} else {
			startCallback($tag, "", 0);
		}

		// continue after '>'; at end of input this ends the outer loop
		$c = $t + 1;
	}

	return;
}
	
?>															

Comment and Contribute

Your comment has been submitted and is pending approval.

Author:
Bolke de Bruin

Comment:



Comment:

(Maximum characters: 1200). You have characters left.