From: Harishankar
Date: Wed, 1 Dec 2010 08:46:59 +0000 (+0530)
Subject: Implemented the search database exporting
X-Git-Url: https://harishankar.org/repos/?a=commitdiff_plain;h=df54af4f6bf7afa6b89f5fa1f07177313aa5cc1f;p=biaweb_qt.git

Implemented the search database exporting

Implemented the search database exporting functionality. TODO: the RSS
feed functionality; once that is done, the exporter will be complete.
---

diff --git a/biaweb_exporter.py b/biaweb_exporter.py
index 4560181..47ccead 100644
--- a/biaweb_exporter.py
+++ b/biaweb_exporter.py
@@ -8,7 +8,29 @@ import time
 import sqlite3
 import string
 import shutil
+import HTMLParser
+import cPickle
 import biaweb_db
+import biaweb_strings
+
+# class to remove HTML tags from a string - used by the search
+# generator to get an article without HTML tags
+class HTMLTagRemover (HTMLParser.HTMLParser):
+    def __init__ (self):
+        HTMLParser.HTMLParser.__init__ (self)
+        # initialize the list for data - this will be converted
+        # to a string before returning
+        self.data_list = []
+
+    # this event receives the data of the string, that is, anything
+    # that is not a HTML tag
+    def handle_data (self, data):
+        self.data_list.append (data)
+
+    # return the string collected by the handle_data event
+    def get_raw_string (self):
+        raw_string = "".join (self.data_list)
+        return raw_string
 
 # to format the best rated articles in a HTML link list
 def html_format_best_rated (best_rated):
@@ -49,6 +71,73 @@ def html_format_rating (rating):
     rating_str = "".join (items)
     return rating_str
 
+# function to generate the search index file
+def generate_search_index (dbname, conf, full_text_index = True):
+    # get all the articles
+    arts = biaweb_db.site_articles (dbname)
+    # if cannot get articles
+    if arts == False:
+        return False
+
+    # for a full text index, search the article content field; otherwise the keywords field
+    if full_text_index:
+        searchfield = 4
+    else:
+        searchfield = 3
+
+    # initialize the search index dictionary
+    search_index = dict ()
+
+    # now run through the articles and generate a table of unique words
+    # (except stop words)
+    for art in arts:
+        # strip out the HTML tags from the article
+        parser = HTMLTagRemover ()
+        parser.feed (art[searchfield])
+        parser.close ()
+        # get the word list
+        word_list = parser.get_raw_string ().split ()
+
+        # run through each word, make it lowercase, remove all cruft
+        # from it and add it to the dictionary
+        for word in word_list:
+            cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
+            # if the word is not a "stop word", add it to the search database
+            if cleanword not in biaweb_strings.stopwords:
+                # title of the search entry should be the article title
+                title = art[1]
+                # url should be the article URL: http://siteurl/Category/Article.html
+                url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
+                # if the search index already has the word as a key
+                if search_index.has_key (cleanword):
+                    # add the title and url as a tuple to the set
+                    search_index[cleanword].add ((title, url))
+                # otherwise create the key for the word
+                else:
+                    # create a set for the keyword; the set will hold the
+                    # tuples representing article title and url
+                    search_index[cleanword] = set ()
+                    search_index[cleanword].add ((title, url))
+
+    # done; now write the search database as a python pickle of search_index
+    search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
+    htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
+    try:
+        # open the file in write binary mode
+        fsearchindex = open (search_index_path, "wb")
+        # dump the dictionary as a pickle object (binary protocol 2)
+        cPickle.dump (search_index, fsearchindex, 2)
+        fsearchindex.close ()
+        # write the htaccess file to keep the index file from being fetched by a web browser
+        fhtaccess = open (htaccess_path, "w+")
+        fhtaccess.write (biaweb_strings.searchindex_htaccess)
+        fhtaccess.close ()
+    except (OSError, IOError):
+        return False
+
+    # finally return true
+    return True
+
 # function to copy additional files and folders to the destination path
 def copy_files_folders (conf, files_to_copy, folders_to_copy):
     # create the cgi-bin directory and try to copy search.py into the destination directory if possible
@@ -104,8 +193,6 @@ def copy_files_folders (conf, files_to_copy, folders_to_copy):
     # finally return true
     return True
 
-
-
 # function to generate article pages
 def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
     # main template
@@ -336,5 +423,10 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
     if ret == False:
         return False
 
+    # now generate the search index database
+    ret = generate_search_index (dbname, conf, search_type_full)
+    if ret == False:
+        return False
+
     # finally when all is successfully done return true
     return True
diff --git a/biaweb_strings.py b/biaweb_strings.py
index 30b9f2c..8f7f232 100644
--- a/biaweb_strings.py
+++ b/biaweb_strings.py
@@ -450,4 +450,9 @@ yes
 yet
 you
 your
-z""".split ("\n")
\ No newline at end of file
+z""".split ("\n")
+
+searchindex_htaccess = """
+order allow,deny
+deny from all
+"""
\ No newline at end of file
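
For context, here is a minimal sketch of how a CGI script might consume the
searchwords.idx file written above. The actual search.py that
copy_files_folders () copies into cgi-bin is not part of this diff, so the
loader and the query logic below are assumptions; the only thing taken from
the commit itself is the pickle layout (a dict mapping each indexed word to a
set of (title, url) tuples, dumped with cPickle protocol 2 in binary mode).

#!/usr/bin/env python
# Hypothetical example only - not the search.py shipped with BiaWeb.
# Loads the pickled search index written by generate_search_index ()
# and returns the (title, url) pairs that match every word of a query.
import cPickle

def load_search_index (path = "searchwords.idx"):
    # the index was dumped in binary mode with protocol 2,
    # so it must be read back in binary mode as well
    findex = open (path, "rb")
    search_index = cPickle.load (findex)
    findex.close ()
    return search_index

def search (search_index, query):
    # normalize the query words the same way the indexer cleans them:
    # strip surrounding punctuation and lowercase
    words = [w.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
             for w in query.split ()]
    results = None
    for word in words:
        matches = search_index.get (word, set ())
        # intersect the per-word sets so every query word must match
        results = matches if results is None else (results & matches)
    return sorted (results or set ())

if __name__ == "__main__":
    index = load_search_index ()
    for title, url in search (index, "biaweb export"):
        print title, "-", url

Because each word maps to a set of (title, url) tuples, an AND query is just
a set intersection, and duplicate hits for the same article collapse for
free. The .htaccess written next to the index keeps the pickle from being
served to browsers while the CGI script can still read it from disk.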