# Implemented the search database exporting
# [biaweb_qt.git] / biaweb_exporter.py
1 # BiaWeb Website content manager (c) 2010 V.Harishankar
2 # Site exporter/generator class
3
4 import os
5 import os.path
6 import sys
7 import time
8 import sqlite3
9 import string
10 import shutil
11 import HTMLParser
12 import cPickle
13 import biaweb_db
14 import biaweb_strings
15
16 # class to remove HTML tags from a string - used by the search
17 # generator to get an article without HTML tags
# class to strip HTML markup from text - the search generator uses this
# to obtain an article's plain-text content for indexing
class HTMLTagRemover (HTMLParser.HTMLParser):
    def __init__ (self):
        HTMLParser.HTMLParser.__init__ (self)
        # pieces of plain text collected between tags; joined into a
        # single string when get_raw_string is called
        self.data_list = []

    # parser callback: fires for every run of text that is not a tag
    def handle_data (self, data):
        self.data_list.append (data)

    # return all the text collected by handle_data as one string
    def get_raw_string (self):
        return "".join (self.data_list)
34
35 # to format the best rated articles in a HTML link list
# render the best rated articles as an HTML bulleted link list
def html_format_best_rated (best_rated):
    # art[13] is the category stub and art[8] the article stub, giving a
    # relative url of Category/Article.html; art[1] is the title
    links = ['<li><a href="' + art[13] + '/' + art[8] + '.html">' + art[1] + '</a></li>\n'
             for art in best_rated]
    return "<ul>\n" + "".join (links) + "</ul>\n"
46
47 # to format categories in a HTML link list
# render the categories as an HTML bulleted link list
def html_format_categories (cats):
    # cat[3] is the category stub (its directory), cat[1] the display name
    links = ['<li><a href="' + cat[3] + '/">' + cat[1] + '</a></li>\n'
             for cat in cats]
    return "<ul>\n" + "".join (links) + "</ul>\n"
57
58 # to convert a rating number into rating images out of 10 stars
# render a rating number as star images out of 10 stars
def html_format_rating (rating):
    # -1 is the sentinel for an article that was never rated
    if rating == -1:
        return "unrated"
    # lit stars for the rating itself, grey stars for the rest of the
    # ten slots (string repetition with a negative count yields "")
    return ('<img src="star.gif" alt="*" />' * rating
            + '<img src="star-grey.gif" alt="-" />' * (10 - rating))
73
74 # function to generate the search index file
# function to generate the search index file
#
# dbname          - path to the site database
# conf            - site configuration tuple (conf[0] is the site url,
#                   conf[5] the destination directory)
# full_text_index - when True index the full article content, otherwise
#                   only the article keywords
#
# returns True on success, False on failure
def generate_search_index (dbname, conf, full_text_index = True):
    # get all the articles
    arts = biaweb_db.site_articles (dbname)
    # if cannot get articles
    if arts == False:
        return False

    # if full text index, then field should be article content otherwise keywords
    if full_text_index:
        searchfield = 4
    else:
        searchfield = 3

    # search index maps each indexed word to a set of (title, url) tuples
    search_index = dict ()

    # now run through the articles and generate a table of unique words
    # (except stop words)
    for art in arts:
        # strip out the HTML tags from the article text
        parser = HTMLTagRemover ()
        parser.feed (art[searchfield])
        parser.close ()
        # get the word list
        word_list = parser.get_raw_string ().split ()

        # title of every search entry for this article is its title and
        # its url is http://siteurl/Category/Article.html - both are
        # loop invariant, so compute them once per article
        title = art[1]
        url = "http://" + conf[0] + art[13] + "/" + art[8] + ".html"

        # now run through each word, make it lowercase, remove all cruft
        # from it and add it to the dictionary
        for word in word_list:
            cleanword = word.strip (":;?!_<>,.+-\"'=`!@#$%^&*()[]{}/= \n\r\t").lower ()
            # if the word is not a "stop word", add it to the index;
            # setdefault replaces the deprecated has_key dance and
            # creates the set on first sight of the word
            if cleanword not in biaweb_strings.stopwords:
                search_index.setdefault (cleanword, set ()).add ((title, url))

    # done - now write the search database as a python pickle of search_index
    search_index_path = os.path.join (conf[5], "cgi-bin", "searchwords.idx")
    htaccess_path = os.path.join (conf[5], "cgi-bin", ".htaccess")
    try:
        # dump the dictionary as a pickle object in binary mode; close
        # the file even when the dump itself fails
        fsearchindex = open (search_index_path, "wb")
        try:
            cPickle.dump (search_index, fsearchindex, 2)
        finally:
            fsearchindex.close ()
        # write the htaccess file to prevent opening the index file from
        # a web browser
        fhtaccess = open (htaccess_path, "w+")
        try:
            fhtaccess.write (biaweb_strings.searchindex_htaccess)
        finally:
            fhtaccess.close ()
    # bug fix: "except OSError, IOError:" caught only OSError and bound
    # it to the name IOError - a real IOError escaped uncaught
    except (OSError, IOError):
        return False

    # finally return true
    return True
140
141 # function to copy additional files and folders to the destination path
# function to copy additional files and folders to the destination path
#
# conf            - site configuration tuple (conf[5] is the destination
#                   directory)
# files_to_copy   - list of (source, relative destination) file pairs
# folders_to_copy - list of (source, relative destination) folder pairs
#
# returns True on success, False on any copy error
def copy_files_folders (conf, files_to_copy, folders_to_copy):
    # create the cgi-bin directory and try to copy search.py into the
    # destination directory if possible - otherwise user must copy it manually
    search_script_path = os.path.join (sys.path[0], "search.py")
    if os.path.exists (search_script_path):
        try:
            os.mkdir (os.path.join (conf[5], "cgi-bin"))
            shutil.copy2 (search_script_path, os.path.join (conf[5], "cgi-bin"))
        # bug fix: "except IOError, OSError:" caught only IOError and
        # bound it to the name OSError - os.mkdir raises OSError, which
        # escaped uncaught
        except (IOError, OSError):
            return False

    # try to copy the star rating images to destination directory if
    # possible - otherwise user must copy them manually
    rating_img_star = os.path.join (sys.path[0], "star.gif")
    rating_img_greystar = os.path.join (sys.path[0], "star-grey.gif")
    if os.path.exists (rating_img_star):
        try:
            shutil.copy2 (rating_img_star, conf[5])
        except (IOError, OSError):
            return False
    if os.path.exists (rating_img_greystar):
        try:
            shutil.copy2 (rating_img_greystar, conf[5])
        except (IOError, OSError):
            return False

    # copy the additional files (no emptiness test needed - iterating an
    # empty list simply does nothing; also drops the deprecated "<>")
    for src, dest in files_to_copy:
        # get full path from relative path in dest
        full_dest = os.path.join (conf[5], dest)
        try:
            shutil.copy2 (src, full_dest)
        except (IOError, OSError):
            return False

    # now copy the additional folders
    for src, dest in folders_to_copy:
        # get full path from relative path in dest
        full_dest = os.path.join (conf[5], dest)
        try:
            shutil.copytree (src, full_dest)
        except (IOError, OSError):
            return False

    # finally return true
    return True
195
196 # function to generate article pages
# function to generate the article pages
#
# dbname        - path to the site database
# conf          - site configuration tuple
# templates     - template list; templates[0] is the main page template,
#                 templates[1] the article bit
# category_str  - preformatted HTML list of categories
# bestrated_str - preformatted HTML list of best rated articles
#
# returns True on success, False on failure
def generate_article_pages (dbname, conf, templates, category_str, bestrated_str):
    # main template
    tpl_main = string.Template (templates[0][1])
    # article template
    tpl_articlebit = string.Template (templates[1][1])

    # get all articles from the database
    articles = biaweb_db.site_articles (dbname)
    if articles == False:
        # bug fix: this path returned None, so the caller's
        # "ret == False" check silently treated the failure as success
        return False

    # walk through each article and generate the file in the appropriate
    # category folder
    for art in articles:
        art_cdate = time.ctime (art[5])
        art_mdate = time.ctime (art[6])
        rating_str = html_format_rating (art[9])
        # now build the article from the article bit template
        article_str = tpl_articlebit.safe_substitute (article_title = art[1],
                            article_cdate = art_cdate,
                            article_mdate = art_mdate,
                            rating = rating_str,
                            article_contents = art[4])

        # now build the article page
        articlepage_str = tpl_main.safe_substitute (site_title = conf[1],
                            site_url = "http://" + conf[0],
                            meta_keywords = art[3],
                            meta_description = art[2],
                            page_title = conf[1],
                            page_desc = conf[3],
                            contents_bit = article_str,
                            list_of_categories = category_str,
                            list_best_rated = bestrated_str,
                            copyright = conf[6])
        # write to the article file, closing it even when the write fails
        # (the original never closed it at all)
        try:
            farticle = open (os.path.join (conf[5], art[13], art[8] + ".html"), "w+")
            try:
                farticle.write (articlepage_str)
            finally:
                farticle.close ()
        # bug fix: "except OSError, IOError:" caught only OSError and
        # bound it to the name IOError - catch both types
        except (OSError, IOError):
            return False

    # finally return true
    return True
241
242 # function to generate category directories and indices
# function to generate category directories and index pages
#
# dbname        - path to the site database
# conf          - site configuration tuple
# templates     - template list; templates[0] is the main page template,
#                 templates[3] the table bit, templates[4] the row bit
# category_str  - preformatted HTML list of categories
# bestrated_str - preformatted HTML list of best rated articles
# category_list - category records (cat[0] id, cat[1] name, cat[2]
#                 description, cat[3] stub/directory name)
#
# returns True on success, False on failure
def generate_category_indices (dbname, conf, templates, category_str, bestrated_str, category_list):
    # main template
    tpl_main = string.Template (templates[0][1])
    # table bit
    tpl_tablebit = string.Template (templates[3][1])
    # table row bit
    tpl_trowbit = string.Template (templates[4][1])

    # run through each category and generate its index page
    for cat in category_list:
        try:
            # create the category directory
            os.mkdir (os.path.join (conf[5], cat[3]))
        # bug fix: "except IOError, OSError:" caught only IOError and
        # bound it to the name OSError - but os.mkdir raises OSError,
        # so the failure this clause exists for escaped uncaught
        except (IOError, OSError):
            return False

        # now get the list of articles for the specified category
        articles_list = biaweb_db.site_articles (dbname, cat[0])
        if articles_list == False:
            return False

        tableitems = []
        # run through the list of articles in the category
        for art in articles_list:
            url = art[13] + "/" + art[8] + ".html"
            creattime = time.ctime (art[5])
            rating_str = html_format_rating (art[9])
            # now build the table row for this article
            tableitem_str = tpl_trowbit.safe_substitute (article_url = url,
                                title = art[1],
                                created = creattime,
                                rating = rating_str)
            tableitems.append (tableitem_str)
        # generate the rows as a string
        tablerows_str = "".join (tableitems)

        # now create the page template
        table_str = tpl_tablebit.safe_substitute (category_title = cat[1],
                            category_desc = cat[2],
                            table_rows = tablerows_str)

        # now create the index page
        categoryindex_str = tpl_main.safe_substitute (site_title = conf[1],
                            site_url = "http://" + conf[0],
                            meta_keywords = conf[2],
                            meta_description = cat[2],
                            page_title = conf[1],
                            page_desc = conf[3],
                            contents_bit = table_str,
                            list_of_categories = category_str,
                            list_best_rated = bestrated_str,
                            copyright = conf[6])

        # now write to Category/index.html, closing the file even when
        # the write fails
        try:
            fcatindex = open (os.path.join (conf[5], cat[3], "index.html"), "w+")
            try:
                fcatindex.write (categoryindex_str)
            finally:
                fcatindex.close ()
        # bug fix: same "except OSError, IOError:" defect as above
        except (OSError, IOError):
            return False

    # finally return true
    return True
306
307 # function to generate main index file and stylesheet
# function to generate the main index file and the stylesheet
#
# dbname        - path to the site database
# conf          - site configuration tuple
# templates     - template list; templates[0] is the main page template,
#                 templates[2] the news bit, templates[5] the stylesheet,
#                 templates[6] the index bit
# category_str  - preformatted HTML list of categories
# bestrated_str - preformatted HTML list of best rated articles
#
# returns True on success, False on failure
def generate_home_page (dbname, conf, templates, category_str, bestrated_str):
    # main template
    tpl_main = string.Template (templates[0][1])
    # index bit
    tpl_indexbit = string.Template (templates[6][1])
    # news bits
    tpl_newsbit = string.Template (templates[2][1])

    # get the latest articles - conf[4] is the number of rss entries,
    # used here as the news item count as well
    latest_arts = biaweb_db.site_latest_articles (dbname, conf[4])
    if latest_arts == False:
        return False

    news_items = []

    # run through the latest articles to build the news items shown on
    # the index page
    for art in latest_arts:
        # url is Category/Article.html
        url = art[13] + "/" + art[8] + ".html"
        # art[5] is the creation time
        strdate = time.ctime (art[5])
        # populate the template variables - art[1] is the title, art[2]
        # the summary
        strnews = tpl_newsbit.safe_substitute (news_title = art[1],
                            news_link = url,
                            news_datetime = strdate,
                            news_description = art[2]
                            )
        news_items.append (strnews)
    # now convert the items into a single string
    newsbit_str = "".join (news_items)

    # now populate the index template
    indexbit_str = tpl_indexbit.safe_substitute (site_name = conf[1],
                        news_updates = newsbit_str
                        )
    # now populate the main page template
    main_str = tpl_main.safe_substitute (site_title = conf[1],
                        site_url = "http://" + conf[0],
                        meta_keywords = conf[2],
                        meta_description = conf[3],
                        page_title = conf[1],
                        page_desc = conf[3],
                        contents_bit = indexbit_str,
                        list_of_categories = category_str,
                        list_best_rated = bestrated_str,
                        copyright = conf[6])

    # write the index.html file in the destination directory, closing it
    # even when the write fails
    try:
        findex = open (os.path.join (conf[5], "index.html"), "w+")
        try:
            findex.write (main_str)
        finally:
            findex.close ()
    # bug fix: "except IOError, OSError:" caught only IOError and bound
    # it to the name OSError - catch both types
    except (IOError, OSError):
        return False

    # write the style.css file in the destination directory
    try:
        fstyle = open (os.path.join (conf[5], "style.css"), "w+")
        try:
            fstyle.write (templates[5][1])
        finally:
            fstyle.close ()
    # bug fix: same defect as above
    except (IOError, OSError):
        return False

    return True
373
374 # superfunction to generate the site
# superfunction to generate the whole site
#
# dbname           - path to the site database
# files_to_copy    - list of (source, relative dest) extra files
# folders_to_copy  - list of (source, relative dest) extra folders
# search_type_full - True for a full-text search index, False to index
#                    only article keywords
#
# returns True on success, False on failure
def generate_site (dbname, files_to_copy, folders_to_copy, search_type_full=True):
    # get the configuration; guard against a failed fetch - the rest of
    # this module signals database errors by returning False, and the
    # original blindly indexed conf/tpls below
    conf = biaweb_db.get_configuration (dbname)
    if conf == False:
        return False
    # get the templates
    tpls = biaweb_db.get_templates (dbname)
    if tpls == False:
        return False

    # get the list of categories
    cats = biaweb_db.get_categories (dbname)
    # cannot get categories - return false
    if cats == False:
        return False

    # format the categories as a html bulleted list
    cats_str = html_format_categories (cats)

    # get the best rated articles
    best_rated = biaweb_db.site_get_bestrated (dbname)
    # if cannot retrieve
    if best_rated == False:
        return False
    # format the best rated articles as a html bulleted list
    best_rated_str = html_format_best_rated (best_rated)

    # remove the destination tree and recreate it
    try:
        if os.path.exists (conf[5]):
            shutil.rmtree (conf[5])
        os.mkdir (conf[5])
    except OSError:
        return False

    # generate the index page including the style sheet
    if generate_home_page (dbname, conf, tpls, cats_str, best_rated_str) == False:
        return False

    # generate the category directories and indices
    if generate_category_indices (dbname, conf, tpls, cats_str, best_rated_str, cats) == False:
        return False

    # generate the article pages
    if generate_article_pages (dbname, conf, tpls, cats_str, best_rated_str) == False:
        return False

    # copy other files/folders into the destination path
    if copy_files_folders (conf, files_to_copy, folders_to_copy) == False:
        return False

    # now generate the search index database
    if generate_search_index (dbname, conf, search_type_full) == False:
        return False

    # finally, when all is successfully done, return true
    return True