X-Git-Url: https://harishankar.org/repos/?p=biaweb_qt.git;a=blobdiff_plain;f=biaweb_exporter.py;h=6cba7065096e61c2ab96244d5ab963de2df9b89f;hp=7c887c6c26642b099e63a9c93ba29d41e253f9ba;hb=HEAD;hpb=de54d2eb3b283030bd98f7a1f6d056ce2e56b4d7

diff --git a/biaweb_exporter.py b/biaweb_exporter.py
index 7c887c6..6cba706 100644
--- a/biaweb_exporter.py
+++ b/biaweb_exporter.py
@@ -3,11 +3,34 @@
 
 import os
 import os.path
+import sys
 import time
 import sqlite3
 import string
 import shutil
+import HTMLParser
+import cPickle
 import biaweb_db
+import biaweb_strings
+
+# HTML parser subclass that discards markup and keeps only the text - the
+# search index generator feeds article HTML through it to get plain words
+class HTMLTagRemover (HTMLParser.HTMLParser):
+	def __init__ (self):
+		# text fragments seen so far, in document order - they are only
+		# concatenated when get_raw_string is called
+		self.data_list = []
+		HTMLParser.HTMLParser.__init__ (self)
+
+	# parser event fired with each piece of the input that is not a HTML
+	# tag - the piece is stored unchanged
+	def handle_data (self, data):
+		self.data_list.append (data)
+
+	# return every fragment collected by handle_data, concatenated with no
+	# separator between the fragments
+	def get_raw_string (self):
+		return "".join (self.data_list)
 
 # to format the best rated articles in a HTML link list
 def html_format_best_rated (best_rated):
@@ -48,6 +71,174 @@ def html_format_rating (rating):
 	rating_str = "".join (items)
 	return rating_str
 
+# function to generate the search index file
+# dbname          - database name passed on to biaweb_db.site_articles
+# conf            - site configuration: conf[0] is the site URL without the
+#                   "http://" prefix, conf[5] is the destination directory
+# full_text_index - True indexes the article contents (field 4), False
+#                   indexes only the article keywords (field 3)
+# writes <conf[5]>/cgi-bin/searchwords.idx (a pickled dictionary mapping each
+# word to a set of (title, url) tuples) plus a .htaccess file beside it.
+# returns True on success, False if the articles cannot be read or the files
+# cannot be written
+def generate_search_index (dbname, conf, full_text_index = True):
+	# get all the articles; give up if they cannot be read
+	arts = biaweb_db.site_articles (dbname)
+	if arts is False:
+		return False
+
+	# if full text index, then field should be article content otherwise keywords
+	if full_text_index:
+		searchfield = 4
+	else:
+		searchfield = 3
+
+	# initialize the search index dictionary
+	search_index = dict ()
+
+	for art in arts:
+		# the entry stored for every word of this article is the same, so
+		# build it once: (article title, http://siteurl/Category/Article.html)
+		entry = (art[1], "http://" + conf[0] + art[13] + "/" + art[8] + ".html")
+
+		# now strip out the HTML tags from the article
+		parser = HTMLTagRemover ()
+		parser.feed (art[searchfield])
+		parser.close ()
+
+		# run through each word, make it lowercase and remove all cruft from it
+		for word in parser.get_raw_string ().split ():
+			cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
+			# a token made only of punctuation strips down to an empty string;
+			# skip those as well as the "stop words"
+			if cleanword and cleanword not in biaweb_strings.stopwords:
+				# the set for a word is created the first time the word is seen
+				search_index.setdefault (cleanword, set ()).add (entry)
+
+	# done now write the search database as a python pickle object of search_index
+	search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
+	htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
+	try:
+		# dump the dictionary as a pickle object in binary mode (protocol 2)
+		fsearchindex = open (search_index_path, "wb")
+		try:
+			cPickle.dump (search_index, fsearchindex, 2)
+		finally:
+			# close the file even if the dump fails part way
+			fsearchindex.close ()
+		# write the htaccess file to prevent opening the index file from web browser
+		fhtaccess = open (htaccess_path, "w+")
+		try:
+			fhtaccess.write (biaweb_strings.searchindex_htaccess)
+		finally:
+			fhtaccess.close ()
+	except (OSError, IOError):
+		return False
+
+	# finally return true
+	return True
+
+# function to copy additional files and folders to the destination path
+#
+# conf            - site configuration; conf[5] is the destination directory
+# files_to_copy   - list of (source file, destination relative to conf[5])
+# folders_to_copy - list of (source folder, destination relative to conf[5])
+#
+# returns True when everything was copied, False on the first failure
+def copy_files_folders (conf, files_to_copy, folders_to_copy):
+	# errors that mean a copy failed. shutil.Error is listed explicitly since
+	# it is what copytree raises when individual files fail and what copy2
+	# raises when source and destination are the same file
+	copy_errors = (IOError, OSError, shutil.Error)
+
+	# the search index is always written into cgi-bin, so the directory is
+	# created even when search.py is not available to copy
+	cgibin_path = os.path.join (conf[5], "cgi-bin")
+	try:
+		if not os.path.isdir (cgibin_path):
+			os.mkdir (cgibin_path)
+	except copy_errors:
+		return False
+
+	# try to copy search.py into cgi-bin and the star rating images into the
+	# destination directory if they exist in sys.path[0] (the directory of the
+	# running script), otherwise user must copy them manually
+	bundled = [("search.py", cgibin_path),
+				("star.gif", conf[5]),
+				("star-grey.gif", conf[5])]
+	for name, dest_dir in bundled:
+		src = os.path.join (sys.path[0], name)
+		if os.path.exists (src):
+			try:
+				shutil.copy2 (src, dest_dir)
+			except copy_errors:
+				return False
+
+	# additional files to copy (nothing happens if the list is empty)
+	for src, dest in files_to_copy:
+		try:
+			# get full path from relative path in dest
+			shutil.copy2 (src, os.path.join (conf[5], dest))
+		except copy_errors:
+			return False
+
+	# additional folders to copy
+	for src, dest in folders_to_copy:
+		try:
+			# copytree creates the destination folder itself
+			shutil.copytree (src, os.path.join (conf[5], dest))
+		except copy_errors:
+			return False
+
+	# finally return true
+	return True
+
+# function to generate article pages: each article is rendered with the
+# article template (templates[1][1]) inside the main template (templates[0][1])
+# and written to <conf[5]>/<art[13]>/<art[8]>.html
+# returns True on success, False if the articles cannot be read or a page
+# cannot be written
+def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
+	tpl_main = string.Template (templates[0][1])
+	tpl_articlebit = string.Template (templates[1][1])
+
+	# failure is reported as False (not None): the caller tests with "is False"
+	articles = biaweb_db.site_articles (dbname)
+	if articles is False:
+		return False
+
+	for art in articles:
+		# now build the article from the article bit template
+		article_str = tpl_articlebit.safe_substitute (article_title = art[1],
+													article_cdate = time.ctime (art[5]),
+													article_mdate = time.ctime (art[6]),
+													rating = html_format_rating (art[9]),
+													article_contents = art[4])
+
+		# now build the article page
+		articlepage_str = tpl_main.safe_substitute (site_title = art[1],
+													site_url = "http://" + conf[0],
+													meta_keywords = art[3],
+													meta_description = art[2],
+													page_title = conf[1],
+													page_desc = conf[3],
+													contents_bit = article_str,
+													list_of_categories = category_str,
+													list_best_rated = bestrated_str,
+													copyright = conf[6])
+		# write the article file, always closing it so no handle is leaked
+		try:
+			farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+")
+			try:
+				farticle.write (articlepage_str)
+			finally:
+				farticle.close ()
+		except (OSError, IOError):
+			return False
+
+	# finally return true
+	return True
+
 # function to generate category directories and indices
 def generate_category_indices (dbname, conf, templates, category_str, bestrated_str, category_list):
 	# main template
@@ -62,12 +253,12 @@ def generate_category_indices (dbname, conf, templates, category_str, bestrated_
 		try:
 			# create the category directory
 			os.mkdir (os.path.join (conf[5], cat[3]))
-		except IOError, OSError:
+		except (IOError, OSError):
 			return False
 
 		# now get the list of articles for the specified category
 		articles_list = biaweb_db.site_articles (dbname, cat[0])
-		if articles_list == False:
+		if articles_list is False:
 			return False
 
 		tableitems = []
@@ -91,7 +282,7 @@ def generate_category_indices (dbname, conf, templates, category_str, bestrated_
 														table_rows = tablerows_str)
 
 		# now create the index page
-		categoryindex_str =  tpl_main.safe_substitute (site_title = conf[1],
+		categoryindex_str =  tpl_main.safe_substitute (site_title = conf[1] + " - " + cat[1],
 														site_url = "http://" + conf[0],
 														meta_keywords = conf[2],
 														meta_description = cat[2],
@@ -107,12 +298,53 @@ def generate_category_indices (dbname, conf, templates, category_str, bestrated_
 			fcatindex = open (os.path.join (conf[5], cat[3], "index.html"), "w+")
 			fcatindex.write (categoryindex_str)
 			fcatindex.close ()
-		except OSError, IOError:
+		except (OSError, IOError):
 			return False
 
 	# finally return true
 	return True
 
+# function to generate the RSS feed for the website
+# writes <conf[5]>/subscribe.xml from the latest articles (conf[4] is passed
+# as the limit). returns True on success, False if the articles cannot be
+# read or the feed cannot be written
+def generate_rss_feed (dbname, conf):
+	tpl_rss = string.Template (biaweb_strings.template_rss)
+	tpl_rss_itembit = string.Template (biaweb_strings.template_rss_item)
+
+	# get the latest articles (limit by number of rss items)
+	arts = biaweb_db.site_latest_articles (dbname, conf[4])
+	if arts is False:
+		return False
+
+	# run through the articles and generate the rss items
+	rss_item_list = []
+	for art in arts:
+		itemlink = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"
+		item_str = tpl_rss_itembit.safe_substitute (item_title = art[1],
+													item_link = itemlink,
+													description = art[2])
+		rss_item_list.append (item_str)
+
+	# now generate the feed with the rss items joined into one string
+	rss_str = tpl_rss.safe_substitute (title = conf[1],
+										link = "http://" + conf[0],
+										description = conf[3],
+										rss_items = "".join (rss_item_list))
+
+	# now try to write it to the rss file, closing it even if the write fails
+	try:
+		frss = open (os.path.join (conf[5], "subscribe.xml"), "w+")
+		try:
+			frss.write (rss_str)
+		finally:
+			frss.close ()
+	except (IOError, OSError):
+		return False
+
+	# finally return true
+	return True
+
 # function to generate main index file and stylesheet
 def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
 	# main template
@@ -124,7 +356,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
 
 	# get the latest articles - conf[4] is num of rss entries to be used also
 	latest_arts = biaweb_db.site_latest_articles (dbname, conf[4])
-	if latest_arts == False:
+	if latest_arts is False:
 		return False
 
 	news_items = []
@@ -167,7 +399,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
 		findex = open (os.path.join (conf[5], "index.html"), "w+")
 		findex.write (main_str)
 		findex.close ()
-	except IOError, OSError:
+	except (IOError, OSError):
 		return False
 
 	# write the style.css file in the destination directory
@@ -175,7 +407,7 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
 		fstyle = open (os.path.join (conf[5], "style.css"), "w+")
 		fstyle.write (templates[5][1])
 		fstyle.close ()
-	except IOError, OSError:
+	except (IOError, OSError):
 		return False
 
 	return True
@@ -184,13 +416,19 @@ def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
 def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True):
 	# get the configuration
 	conf = biaweb_db.get_configuration (dbname)
+	# if cannot get configuration
+	if conf is False:
+		return False
+
 	# get the templates
 	tpls = biaweb_db.get_templates (dbname)
+	if tpls is False:
+		return False
 
 	# get the list of categories
 	cats = biaweb_db.get_categories (dbname)
 	# cannot get categories return false
-	if cats == False:
+	if cats is False:
 		return False
 
 	# format the categories as a html bulleted list
@@ -199,7 +437,7 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
 	# get the best rated articles
 	best_rated = biaweb_db.site_get_bestrated (dbname)
 	# if cannot retrieve
-	if best_rated == False:
+	if best_rated is False:
 		return False
 	# format the best rated articles as a html bulleted list
 	best_rated_str = html_format_best_rated (best_rated)
@@ -214,11 +452,32 @@ def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True
 
 	# generate the index page including style sheet
 	ret = generate_home_page (dbname, conf, tpls, cats_str, best_rated_str)
-	if ret == False:
+	if ret is False:
 		return False
 
+	# generate the rss feed
+	ret = generate_rss_feed (dbname, conf)
+	if ret is False:
+		return False
+
+	# generate the category directories and indices
 	ret = generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats)
-	if ret == False:
+	if ret is False:
+		return False
+
+	# generate the article pages
+	ret = generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str)
+	if ret is False:
+		return False
+
+	# copy other files/folders into the destination path
+	ret = copy_files_folders (conf, files_to_copy, folders_to_copy)
+	if ret is False:
+		return False
+
+	# now generate the search index database
+	ret = generate_search_index (dbname, conf, search_type_full)
+	if ret is False:
 		return False
 
 	# finally when all is successfully done return true