Version: 1.0 (stable)
Type: Class
Category: File Management
License: GNU General Public License
Description: Page Indexer is a class for indexing all words in a web document into a MySQL database. It is very easy to use and VERY powerful! It returns only actual words and throws away everything else that isn't a valid word. The class works with both PHP4 and PHP5! Any comments, bug reports, suggestions etc. can be sent to my e-mail address (which is found in the class). I would really appreciate it if you sent me an e-mail if you use the class, telling me why you use it and what you think of it. Enjoy! It's a fine release =)
<?php
/**
* @name: class PageIndexer
* @file: classPageIndexer.php
* @author: Niklas Forsberg
* @email: niklas.forsberg@bildelstorget.se
* @version: 1.0
*
*
* @description:
* Class for indexing words from a specified web document and
* inserting them in to a specified MySQL-database.
*
*
* @origin:
* This class was originally written by Niklas Forsberg as a
* test project for deliverance to SailSoft.
*
*
* @usage:
* To index a web document, simply include this class in to your
* PHP-document by using require_once("classPageIndexer.php").
*
* In your PHP-document, include this line and edit the options
* to suit your needs:
*
* new PageIndexer(str URL, str MySQL-host[:MySQL-port], str username, str password, str DB-name, int silent-mode)
*
* For example, if you want to put all words (that are recognized by this class)
* from the Google start page in to your MySQL-database named "webindexer", then use:
*
* $IndexPage = new PageIndexer("http://www.google.com", "localhost:3306", "myuser", "mypass", "webindexer", 0);
* (Where myuser and mypass is the login to get access to the database "webindexer" and 0 is for displaying
* all error messages and notifications that may occur (read below for more information about the silent mode)).
*
* Silent mode suppresses all errors and announcements that the class may produce, which means that
* no errors or other messages will be printed out by the class. Use 1 to run the class in silent mode
* or 0 to run it in normal mode (in which the class prints out any errors and announcements).
*
* Directives: All directives except the port number are required!
* If you don't specify a port number, the class will use the default
* port for MySQL, which is "3306".
* If you don't want to specify a port number, simply leave out the :nr part, like:
* ...google.com", "localhost", "myuser...
*
*
* @support:
* PHP: The class supports PHP4 and later (as well as PHP5).
* OS: Independent
* SQL: MySQL Server (at least v3.04)
* (though it can work with older releases as well, try it out!)
*
* You also need a MySQL-database with the following table structure:
*
* CREATE TABLE `tblord` (
* `iKeyOrd` int(11) unsigned NOT NULL auto_increment,
* `sOrd` varchar(50) NOT NULL default '',
* PRIMARY KEY (`iKeyOrd`)
* ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
*
* CREATE TABLE `tblsida` (
* `iKeySida` int(11) unsigned NOT NULL auto_increment,
* `sWebsida` varchar(255) default NULL,
* PRIMARY KEY (`iKeySida`)
* ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
*
* CREATE TABLE `tblsida_ord` (
* `iKeySida` int(11) NOT NULL default '0',
* `iKeyOrd` int(11) NOT NULL default '0'
* ) ENGINE=MyISAM DEFAULT CHARSET=latin1;
*
*/
/**
* License:
*
* Copyright (C) 2005 Niklas Forsberg
*
* This file is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This file is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this file; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
// ****
// **** class PageIndexer
// ****
/**
 * Indexes the words of a web document into a MySQL database.
 *
 * Constructing the object runs the whole pipeline: connect to the database,
 * check that the document can be opened, extract its words and store them in
 * the tables tblsida (documents), tblord (words) and tblsida_ord (links).
 *
 * NOTE(review): the class talks to the database through the legacy mysql_*
 * functions. Those were removed from PHP in 7.0, so on newer runtimes a
 * compatibility layer (or a port to mysqli/PDO) is required.
 */
class PageIndexer {
    /** Full URL of the web document to index (e.g. http://www.mysite.com/news.html). */
    var $pageURL;
    /** Silent-mode flag: 1 suppresses every message, any other value prints them. */
    var $silentMode;
    /** ID (tblsida.iKeySida) of the document, set once it has been found in the database. */
    var $pageID;
    /** Host name of the database server. */
    var $dbHost;
    /** Link returned by mysql_connect(), or false when the connection failed. */
    var $dbOpen;
    /** Port number of the database server; '3306' when the caller gave none. */
    var $dbPort;
    /** Name of the database in which the index is stored. */
    var $dbName;
    /** User name used to connect to the database. */
    var $dbUser;
    /** Password belonging to $dbUser. */
    var $dbPass;
    /** Raw contents of the document at $pageURL, as returned by file_get_contents(). */
    var $pageContents;
    /** Declared for the parsed contents; no method of this class assigns it. */
    var $parsedContents;
    /** Unique, normalised words extracted from the document (keys are not re-indexed). */
    var $wordsIndex;

    /**
     * PHP 5+ constructor; forwards to the PHP 4-style constructor below.
     *
     * Without it, PHP 8 would not run any initialisation on `new PageIndexer(...)`
     * because a method named after the class is no longer treated as a constructor.
     * It is declared first so that older PHP 5 releases do not report the
     * class-named method as a constructor redefinition.
     */
    function __construct($url, $db_host, $db_user, $db_pass, $db_name, $silent) {
        $this->PageIndexer($url, $db_host, $db_user, $db_pass, $db_name, $silent);
    }

    /**
     * constructor PageIndexer(str URL, str dbhost, str dbuser, str dbpass, str dbname, int silent)
     *
     * Stores the settings, connects to the database and starts indexing $url.
     *
     * @param string $url     URL of the document to index.
     * @param string $db_host "host" or "host:port"; the port defaults to 3306.
     * @param string $db_user Database user name.
     * @param string $db_pass Database password.
     * @param string $db_name Database name.
     * @param int    $silent  1 to suppress all messages, 0 to print them.
     */
    function PageIndexer($url, $db_host, $db_user, $db_pass, $db_name, $silent) {
        $this->pageURL = $url;
        $this->silentMode = $silent;
        $this->dbUser = $db_user;
        $this->dbPass = $db_pass;
        $this->dbName = $db_name;

        /** Split "host:port"; the port part is optional. */
        $host_address = explode(":", $db_host);
        $this->dbHost = $host_address[0];
        /** Fall back to the MySQL default port instead of reading a missing array element. */
        if( isset($host_address[1]) && $host_address[1] !== '' ) {
            $this->dbPort = $host_address[1];
        }
        else {
            $this->dbPort = '3306';
        }

        $this->DBConnect($this->dbHost, $this->dbPort, $this->dbUser, $this->dbPass, $this->dbName);
        $this->GetPage($this->pageURL);
    }

    /**
     * method GiveMsg(str message)
     *
     * Prints an announcement or error message unless the class runs in silent mode.
     *
     * @param string $msg Message (may contain HTML) to print.
     */
    function GiveMsg($msg) {
        if( $this->silentMode != 1 ) {
            print $msg;
        }
    }

    /**
     * method DBConnect(str hostname, str portnr, str username, str password, str dbname)
     *
     * Opens the MySQL connection and selects the database. On failure a message
     * is passed to GiveMsg and the script is terminated with the MySQL error.
     *
     * The arguments mirror what the constructor passes in; the connection itself
     * is built from the object's properties (dbHost, dbPort, dbUser, dbPass, dbName),
     * exactly as before.
     */
    function DBConnect($db_host, $db_port, $db_user, $db_pass, $db_name) {
        $this->dbOpen = @mysql_connect("$this->dbHost:$this->dbPort", "$this->dbUser", "$this->dbPass");

        /** Treat a failed connection like a failed database selection, and always
         * select the database on our own link rather than on "the last opened one". */
        if( !$this->dbOpen || !@mysql_select_db($this->dbName, $this->dbOpen) ) {
            $msg = "Could not connect to database <font color=red>'" . htmlspecialchars($this->dbName) . "'</font>.";
            $msg .= "<br>\n";
            $msg .= "<br>\n";
            $this->GiveMsg($msg);
            /** Kill script and return the MySQL error. */
            die('Reason: ' . ($this->dbOpen ? mysql_error($this->dbOpen) : mysql_error()));
        }
    }

    /**
     * method GetPage(str URL)
     *
     * Checks that the document in object pageURL can be opened and, if so,
     * hands over to ParsePage. The $url argument is accepted for compatibility;
     * the URL actually used is object pageURL.
     *
     * @return false|null false when the document cannot be opened, otherwise nothing.
     */
    function GetPage($url) {
        /** The mode must be the string 'r'; the handle is only a reachability probe. */
        $handle = @fopen($this->pageURL, 'r');
        if( !$handle ) {
            $this->GiveMsg("The specified URL ('<font color=red>" . htmlspecialchars($this->pageURL) . "</font>') does not exist!");
            return false;
        }
        /** Release the probe handle; ParsePage fetches the contents itself. */
        fclose($handle);

        $this->ParsePage($this->pageURL);
    }

    /**
     * method ParsePage(str URL)
     *
     * Fetches the document in object pageURL, extracts its words into object
     * wordsIndex and passes them on to IndexWords. The $url argument is accepted
     * for compatibility; the URL actually used is object pageURL.
     */
    function ParsePage($url) {
        $this->pageContents = file_get_contents($this->pageURL);
        /** A failed fetch yields false; treat it as an empty document. */
        $this->wordsIndex = $this->_ExtractWords((string) $this->pageContents);
        $this->IndexWords($this->wordsIndex, $this->pageURL);
    }

    /**
     * Helper: turns raw HTML into the list of unique, normalised words found
     * inside its body-element. Does not touch the database or the object state.
     *
     * @param string $html Raw document source.
     * @return array Unique lower-case words; keys are those left by
     *               array_filter/array_unique and are not re-indexed.
     */
    function _ExtractWords($html) {
        /** Keep only what is inside the body-element; no body means no words. */
        $body = '';
        if( preg_match("@<body([^>]*)> ( (?> [^<]* ) (?> (?! </?body> ) < [^<]*)* ) </body>@imsx", $html, $body_array) ) {
            $body = $body_array[2];
        }

        /** Drop script-elements so their source is not indexed as words. */
        $body = preg_replace("@<script([^>]*)> ( (?> [^<]* ) (?> (?! </?script> ) < [^<]*)*) </script>@imsx", "", $body);

        /** Words glued together by tags (like word1<br>word2) get a blank between them. */
        $body = preg_replace('/(\w)(?:<[^>]+>)+(\w)/', '\1 \2', $body);

        /** Strip the remaining tags and decode HTML entities. */
        $body = html_entity_decode(strip_tags($body));

        /** Remove any &...; entities that html_entity_decode() did not handle. */
        $body = preg_replace('@&\w+;@', '', $body);

        /** Split words joined by a slash (eg "and/or") in to separate words. */
        $body = preg_replace('@([a-zA-Z])/([a-zA-Z])@', '\1 \2', $body);

        $words_array = preg_split("/\s+/ms", $body);
        if( !is_array($words_array) ) {
            $words_array = array();
        }

        /** Normalise every token, keep the valid ones, store each word only once. */
        $filter_words = array_filter(
            array_map(array($this, '_ParseWord'), $words_array),
            array($this, '_CheckWord')
        );
        return array_unique($filter_words);
    }

    /**
     * Helper: tells whether a string is an acceptable word, i.e. consists only of
     * word characters and - ' / . @ : ; (examples: foo, foo-bar, 0123, foo:bar).
     *
     * This is a method rather than a function declared inside ParsePage, because
     * a nested function declaration is fatal the second time ParsePage runs.
     *
     * @param string $word
     * @return bool
     */
    function _CheckWord($word) {
        return (preg_match("/^[-'\/.@:;\w]+$/", $word) === 1);
    }

    /**
     * Helper: lower-cases a token, trims non-word characters from both ends
     * (ending punctuation etc) and removes characters that are not allowed in a word.
     *
     * @param string $word
     * @return string
     */
    function _ParseWord($word) {
        $word = preg_replace('/^\W+/', '', strtolower($word));
        $word = preg_replace('/\W+$/', '', $word);
        $word = preg_replace("/[^-'\/.@:;\w]+/", '', $word);
        return $word;
    }

    /**
     * method IndexWords(array words, str URL)
     *
     * Looks up object pageURL in tblsida. When the document has been indexed
     * before, its index is rebuilt through UpdateIndex; otherwise a new index is
     * created through AddIndex. The arguments are accepted for compatibility; the
     * values used are objects pageURL and wordsIndex.
     */
    function IndexWords($words, $url) {
        /** Escape the URL: it is caller-supplied and goes in to a quoted SQL literal. */
        $safe_url = mysql_real_escape_string($this->pageURL, $this->dbOpen);
        $sql_return_url = mysql_query("SELECT iKeySida FROM tblsida WHERE sWebsida='$safe_url'", $this->dbOpen);

        $already_indexed = false;
        /** A failed query returns false; treat that as "not indexed before". */
        if( $sql_return_url ) {
            while( $url_row = mysql_fetch_array($sql_return_url) ) {
                $this->pageID = $url_row['iKeySida'];
                $already_indexed = true;
            }
        }

        if( $already_indexed ) {
            $this->UpdateIndex($this->pageID, $this->pageURL);
        }
        else {
            $this->AddIndex($this->pageURL, $this->wordsIndex);
        }
    }

    /**
     * method AddIndex(str URL, array words)
     *
     * Creates the document record for object pageURL and links it to every word
     * in object wordsIndex, adding words that are not in tblord yet. The
     * arguments are accepted for compatibility; the values used are the objects.
     *
     * NOTE(review): when no words were found the script is terminated with exit
     * after the document record has been inserted; that behaviour is kept as is.
     *
     * @return false|null false when the document record could not be created.
     */
    function AddIndex($url, $words) {
        /** Create a new record in the database for the web document. */
        $safe_url = mysql_real_escape_string($this->pageURL, $this->dbOpen);
        if( !mysql_query("INSERT INTO tblsida (sWebsida) VALUES ('$safe_url')", $this->dbOpen) ) {
            /** Without a document record there is no ID to link the words to. */
            $this->GiveMsg("The document '<font color=red>" . htmlspecialchars($this->pageURL) . "</font>' could not be stored. Reason: " . htmlspecialchars(mysql_error($this->dbOpen)));
            return false;
        }
        /** Get the last inserted ID number (the actual document ID). */
        $new_index_id = mysql_insert_id($this->dbOpen);

        /** If object wordsIndex is empty, show a message and stop. */
        if( !$this->wordsIndex ) {
            $this->GiveMsg("No words were found on the page you selected to index.");
            exit;
        }

        foreach( $this->wordsIndex as $word ) {
            /** Escape the word for use inside a quoted SQL literal. */
            $safe_word = mysql_real_escape_string($word, $this->dbOpen);
            $sql_match_word = mysql_query("SELECT iKeyOrd, sOrd FROM tblord WHERE sOrd='$safe_word'", $this->dbOpen);

            if( $sql_match_word && mysql_num_rows($sql_match_word) != 0 ) {
                /** The word already exists, so only a new connector/bridge between
                 * our document and the existing word is needed. */
                while( $word_row = mysql_fetch_array($sql_match_word) ) {
                    $word_id = $word_row['iKeyOrd'];
                    mysql_query("INSERT INTO tblsida_ord (iKeySida, iKeyOrd) VALUES ('$new_index_id', '$word_id')", $this->dbOpen);
                }
            }
            else {
                /** New word: add it, then link it. Skip the connector when the insert
                 * failed, because there would be no valid word ID to point at. */
                if( mysql_query("INSERT INTO tblord (sOrd) VALUES ('$safe_word')", $this->dbOpen) ) {
                    $new_word_id = mysql_insert_id($this->dbOpen);
                    mysql_query("INSERT INTO tblsida_ord (iKeySida, iKeyOrd) VALUES ('$new_index_id', '$new_word_id')", $this->dbOpen);
                }
            }
        }

        /** If everything went out okay, then print out a OK-message. */
        $this->GiveMsg("The document '<font color=green>" . htmlspecialchars($this->pageURL) . "</font>' have been successfully indexed.");
    }

    /**
     * method UpdateIndex(str pageid, str URL)
     *
     * Rebuilds the index of a document that was indexed before: removes its
     * connectors, removes words that no other document uses any more, removes
     * the document record and finally calls AddIndex. The arguments are accepted
     * for compatibility; the values used are objects pageID and pageURL.
     */
    function UpdateIndex($pid, $url) {
        $sql_return_connector = mysql_query("SELECT iKeySida, iKeyOrd FROM tblsida_ord WHERE iKeySida = $this->pageID", $this->dbOpen);

        /** A failed query returns false; there is nothing to clean up then. */
        if( $sql_return_connector ) {
            while( $conn_row = mysql_fetch_array($sql_return_connector) ) {
                $_page_id = $conn_row['iKeySida']; # Existing document ID
                $_word_id = $conn_row['iKeyOrd']; # Existing word ID

                /** Delete the connector/bridge between the document and this word. */
                mysql_query("DELETE FROM tblsida_ord WHERE iKeySida=$_page_id AND iKeyOrd=$_word_id", $this->dbOpen);

                /** If no other document refers to the word any more, remove the word
                 * to save database space. Only do so when the check itself succeeded. */
                $sql_return_unused = mysql_query("SELECT * FROM tblsida_ord WHERE iKeyOrd=$_word_id", $this->dbOpen);
                if( $sql_return_unused && mysql_num_rows($sql_return_unused) == 0 ) {
                    mysql_query("DELETE FROM tblord WHERE iKeyOrd=$_word_id", $this->dbOpen);
                }
            }
        }

        /** Delete the document index from the database. */
        mysql_query("DELETE FROM tblsida WHERE iKeySida=$this->pageID", $this->dbOpen);

        /** Re-create the page index for the web page in object pageURL. */
        $this->AddIndex($this->pageURL, $this->wordsIndex);
    }
}
?>