From df54af4f6bf7afa6b89f5fa1f07177313aa5cc1f Mon Sep 17 00:00:00 2001
From: Harishankar
Date: Wed, 1 Dec 2010 14:16:59 +0530
Subject: [PATCH] Implemented the search database exporting

Implemented the search database exporting functionality.
TODO: implement the RSS feed functionality, after which the exporter
will be complete.
---
 biaweb_exporter.py | 96 +++++++++++++++++++++++++++++++++++++++++++++-
 biaweb_strings.py  |  7 +++-
 2 files changed, 100 insertions(+), 3 deletions(-)

diff --git a/biaweb_exporter.py b/biaweb_exporter.py
index 4560181..47ccead 100644
--- a/biaweb_exporter.py
+++ b/biaweb_exporter.py
@@ -8,7 +8,29 @@ import time
 import sqlite3
 import string
 import shutil
+import HTMLParser
+import cPickle
 import biaweb_db
+import biaweb_strings
+
+# class to remove HTML tags from a string - used by the search index
+# generator to get the text of an article without HTML tags
+class HTMLTagRemover (HTMLParser.HTMLParser):
+	def __init__ (self):
+		HTMLParser.HTMLParser.__init__ (self)
+		# initialize the list for data - this will be joined into
+		# a single string before returning
+		self.data_list = []
+
+	# this event receives the text data, that is, anything that is
+	# not an HTML tag
+	def handle_data (self, data):
+		self.data_list.append (data)
+
+	# return the string collected by the handle_data event
+	def get_raw_string (self):
+		raw_string = "".join (self.data_list)
+		return raw_string
 
 # to format the best rated articles in a HTML link list
 def html_format_best_rated (best_rated):
@@ -49,6 +71,73 @@ def html_format_rating (rating):
 	rating_str = "".join (items)
 	return rating_str
 
+# function to generate the search index file
+def generate_search_index (dbname, conf, full_text_index = True):
+	# get all the articles
+	arts = biaweb_db.site_articles (dbname)
+	# if the articles cannot be fetched, bail out
+	if arts == False:
+		return False
+
+	# for a full text index the search field is the article content, otherwise the keywords
+	if full_text_index:
+		searchfield = 4
+	else:
+		searchfield = 3
+
+	# initialize the search index dictionary
+	search_index = dict ()
+
+	# now run through the articles and generate a table of unique words
+	# (except stop words)
+	for art in arts:
+		# strip out the HTML tags from the article
+		parser = HTMLTagRemover ()
+		parser.feed (art[searchfield])
+		parser.close ()
+		# get the word list
+		word_list = parser.get_raw_string ().split ()
+
+		# now run through each word, make it lowercase, strip all cruft
+		# from it and add it to the dictionary
+		for word in word_list:
+			cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
+			# if the word is not a "stop word", add it to the search database
+			if cleanword not in biaweb_strings.stopwords:
+				# title of the search entry should be the article title
+				title = art[1]
+				# url should be the article URL: http://siteurl/Category/Article.html
+				url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
+				# if the search index already has the word as a key
+				if cleanword in search_index:
+					# add the title and url as a tuple to the set
+					search_index[cleanword].add ((title, url))
+				# otherwise create the key for the word
+				else:
+					# create a set for the keyword; the set will hold the
+					# tuples representing article title and url
+					search_index[cleanword] = set ()
+					search_index[cleanword].add ((title, url))
+
+	# done, now write the search database as a python pickle of search_index
+	search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
+	htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
+	try:
+		# open the file in write binary mode
+		fsearchindex = open (search_index_path, "wb")
+		# dump the dictionary as a pickle object (protocol 2, binary)
+		cPickle.dump (search_index, fsearchindex, 2)
+		fsearchindex.close ()
+		# write the htaccess file to prevent fetching the index from a web browser
+		fhtaccess = open (htaccess_path, "w+")
+		fhtaccess.write (biaweb_strings.searchindex_htaccess)
+		fhtaccess.close ()
+	except (OSError, IOError):
+		return False
+
+	# finally return true
+	return True
+
 # function to copy additional files and folders to the destination path
 def copy_files_folders (conf, files_to_copy, folders_to_copy):
 	# create the cgi-bin directory and try to copy search.py into the destination directory if possible
@@ -104,8 +193,6 @@ def copy_files_folders (conf, files_to_copy, folders_to_copy):
 	# finally return true
 	return True
 
-
-
 # function to generate article pages
 def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
 	# main template
@@ -336,5 +423,10 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
 	if ret == False:
 		return False
 
+	# now generate the search index database
+	ret = generate_search_index (dbname, conf, search_type_full)
+	if ret == False:
+		return False
+
 	# finally when all is successfully done return true
 	return True
diff --git a/biaweb_strings.py b/biaweb_strings.py
index 30b9f2c..8f7f232 100644
--- a/biaweb_strings.py
+++ b/biaweb_strings.py
@@ -450,4 +450,9 @@ yes
 yet
 you
 your
-z""".split ("\n")
\ No newline at end of file
+z""".split ("\n")
+
+searchindex_htaccess = """
+order allow,deny
+deny from all
+"""
\ No newline at end of file
-- 
2.20.1
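
A note on how the HTMLTagRemover above behaves: HTMLParser is event
driven, so feed () fires handle_data () once for every run of text
between tags, and get_raw_string () concatenates those runs in document
order. A minimal sketch of the expected behaviour (the sample markup is
illustrative only):

	parser = HTMLTagRemover ()
	parser.feed ("<p>Hello <b>brave</b> new world</p>")
	parser.close ()
	# the tags are dropped; only the text nodes survive
	print parser.get_raw_string ()   # prints "Hello brave new world"

Character and entity references such as &amp; arrive through the
separate handle_charref/handle_entityref events, which this class does
not override, so they are silently dropped from the indexed text.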
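
The pickle written to cgi-bin/searchwords.idx maps each indexed word to
a set of (title, url) tuples. Its intended consumer is the
cgi-bin/search.py script that copy_files_folders () installs, which is
not part of this patch; the snippet below is only a sketch of how such
a consumer could load and query the index under the same Python 2
runtime. load_search_index and lookup are hypothetical names for
illustration, not functions from this codebase:

	import cPickle

	# load the pickled index; it was dumped with protocol 2, so the
	# file must be opened in binary mode
	def load_search_index (index_path):
		findex = open (index_path, "rb")
		search_index = cPickle.load (findex)
		findex.close ()
		return search_index

	# clean each query word exactly as the indexer cleaned the article
	# words, then intersect the per-word hit sets so every word must match
	def lookup (search_index, query):
		results = None
		for word in query.split ():
			cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
			hits = search_index.get (cleanword, set ())
			if results is None:
				results = hits
			else:
				results = results & hits
		if results is None:
			return set ()
		return results

With an index built by generate_search_index (), lookup (idx, "python
sqlite") would then return only the (title, url) pairs filed under both
words.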