#!./exu
-- Documentation Search
-- Searches for the best articles based on the keywords that the user
-- is looking for. The articles are given a score and presented
-- sorted by score. The scoring system strongly favors articles that
-- contain several keywords, rather than just several occurrences of
-- one keyword. Some very common words are ignored (see noise_words).
-- e.g.
-- guru sequence atom
-- Hints: enter an important word twice to double the value of that word
without type_check
include file.e
include get.e
include misc.e
include wildcard.e
include graphics.e
include sort.e
-- boolean values
constant TRUE  = 1
constant FALSE = 0

-- number of entries to output per table
constant OUT_CHUNK_SIZE = 10

-- indexes into the `manuals` selection flags
constant M_REF = 1
constant M_LIB = 2
constant M_OTH = 3

-- a run of dashes; scan() gives extra credit to lines that contain it
constant separator_line = repeat('-', 7)

constant EOF = -1

-- clean() inserts a line break once a line grows past this many columns
constant LINE_WIDTH = 250

-- distance between an upper-case ASCII letter and its lower-case form
constant TO_LOWER = 'a' - 'A'

-- maximum number of chunks to display
constant MAX_CHUNKS = 20

-- desired size for a chunk of text, in lines
constant MIN_CHUNK_SIZE = 15
constant MAX_CHUNK_SIZE = 25

-- one-element marker sequences inserted around matched words
constant LEFT_HIGHLIGHT  = {998}
constant RIGHT_HIGHLIGHT = {999}

-- characters given special treatment by clean()
constant CH_GREATER_THAN = '>'
constant CH_LESS_THAN    = '<'
type boolean(integer x)
-- an integer restricted to the two values 0 and 1
    return not (x != 0 and x != 1)
end type
-- keywords: the words entered by the user (filled in by getKeywords)
-- manuals:  one TRUE/FALSE flag per manual group, indexed by
--           M_REF, M_LIB, M_OTH; all start out unselected
sequence keywords, manuals
manuals = {FALSE, FALSE, FALSE}

-- counters, both starting at zero
integer nhits, totalCount
totalCount = 0
nhits = 0

-- separator used when building document path names
integer SLASH
SLASH = '/'
-- char_class[c] classifies character code c (1..255):
--   0   means not legitimate
--   1   means legitimate char
--   > 1 means possible first char of matching word
sequence char_class
char_class = repeat(0, 255)
for code = 'A' to 'Z' do
    char_class[code] = 1
end for
for code = 'a' to 'z' do
    char_class[code] = 1
end for
for code = '0' to '9' do
    char_class[code] = 1
end for
char_class['_'] = 1

-- the list of words being searched for (assigned in search())
sequence word_list
-- File names belonging to each manual group. look_at() skips a file
-- when it is listed in a group whose flag in `manuals` is FALSE.

-- group M_OTH: the remaining documents
constant OTHER_DOC = {
    "bind.doc",
    "c.doc",
    "database.doc",
    "ed.doc",
    "install.doc",
    "overview.doc",
    "perform.doc",
    "platform.doc",
    "relnotes.doc",
    "trouble.doc",
    "what2do.doc",
    "e2c.doc",
    "license.doc",
    "register.doc"
}

-- group M_REF: the reference manual
constant REF_DOC = {
    "refman_0.doc",
    "refman_1.doc",
    "refman_2.doc",
    "refman_3.doc"
}

-- group M_LIB: the library routine documentation
constant LIB_DOC = {
    "lib_0.doc",
    "lib_a_b.doc",
    "lib_c_d.doc",
    "lib_e_g.doc",
    "lib_h_o.doc",
    "lib_p_r.doc",
    "lib_s_t.doc",
    "lib_u_z.doc"
}
procedure html_puts(object text)
-- Write text (a single character or a string) to file number 1,
-- which is where the whole HTML page is emitted.
puts(1, text)
end procedure
procedure html_printf(sequence format, object values)
-- Formatted counterpart of html_puts: print values according to
-- format on file number 1.
printf(1, format, values)
end procedure
procedure errMessage(sequence msg)
-- Fatal error: show msg in the page's small grey font, close the
-- HTML document and terminate the program with exit status 1.
    html_puts(
        "<p><font face=\"verdana, arial, geneva\" size=-1 color=\"#333333\">")
    html_printf("%s </font>\n</body></html>\n", {msg})
    abort(1)
end procedure
function fast_lower(sequence s)
-- Return a copy of s with the ASCII letters 'A'..'Z' converted to
-- lower case; every other element is left as it is.
-- Kept as a tight loop because lower-casing speed matters here.
    integer ch

    for k = 1 to length(s) do
        ch = s[k]
        if ch >= 'A' and ch <= 'Z' then
            s[k] = ch + TO_LOWER
        end if
    end for
    return s
end function
function clean(sequence line)
-- Prepare one line of chunk text for output inside <pre>...</pre>:
--   * the highlight markers become <B> and </B>
--   * the HTML-special characters '<', '>' and '&' are replaced by
--     character entities so that document text can never be
--     interpreted as markup
--   * control characters below 14 other than \n and \t are made
--     visible: \r becomes a blank, anything else becomes '.'
--   * a '\n' is inserted whenever more than LINE_WIDTH input
--     characters have been seen since the last line start
-- Returns the converted line.
    sequence new_line
    integer c, col

    new_line = ""
    col = 1
    for i = 1 to length(line) do
        if col > LINE_WIDTH then
            -- break up an over-long line
            new_line = append(new_line, '\n')
            col = 1
        end if
        c = line[i]
        col += 1
        if c = LEFT_HIGHLIGHT[1] then
            new_line = new_line & "<B>"
        elsif c = RIGHT_HIGHLIGHT[1] then
            new_line = new_line & "</B>"
        elsif c = CH_GREATER_THAN then
            new_line = new_line & "&gt;"
        elsif c = CH_LESS_THAN then
            new_line = new_line & "&lt;"
        elsif c = '&' then
            new_line = new_line & "&amp;"
        elsif c < 14 then
            if c = '\n' then
                col = 1         -- a real line break restarts the count
            elsif c = '\r' then
                c = ' '
            elsif c != '\t' then
                c = '.'         -- funny control character
            end if
            new_line = append(new_line, c)
        else
            new_line = append(new_line, c)
        end if
    end for
    return new_line
end function
function sum(sequence s)
-- Add up the elements of s, first to last; an empty sequence gives 0.
    atom total
    integer k

    total = 0
    k = 1
    while k <= length(s) do
        total = total + s[k]
        k = k + 1
    end while
    return total
end function
function has_punctuation(sequence word)
-- TRUE if word contains any punctuation characters, i.e. any
-- character other than the wildcards '?' and '*' whose char_class
-- entry is 0.
-- Codes that have no char_class entry at all (below 1 or beyond the
-- end of the table) are treated as punctuation too, instead of
-- causing a subscript error.
    integer c

    for i = 1 to length(word) do
        c = word[i]
        if c != '?' and c != '*' then
            if c < 1 or c > length(char_class) then
                return TRUE
            elsif char_class[c] = 0 then
                return TRUE
            end if
        end if
    end for
    return FALSE
end function
function rename(sequence name)
-- Map a document file name to the name of its HTML page:
--   anything containing "lib_0.doc"    -> "library.htm"
--   anything containing "refman_0.doc" -> "refman.htm"
--   otherwise: drop everything up to and including the first '/',
--   then replace everything from the first '.' onward with ".htm".
-- A name without a '.' is reported through errMessage().
    integer slash_at, dot_at

    if match("lib_0.doc", name) then
        return "library.htm"
    end if
    if match("refman_0.doc", name) then
        return "refman.htm"
    end if

    slash_at = find('/', name)
    if slash_at > 0 then
        name = name[slash_at + 1..length(name)]
    end if

    dot_at = find('.', name)
    if dot_at = 0 then
        errMessage("document file does not have '.'")
    else
        name = name[1..dot_at - 1] & ".htm"
    end if
    return name
end function
-- field indexes of one chunk_list entry
constant SCORE = 1,
         FILE  = 2,
         TEXT  = 3

-- The chunks saved so far as {score, file name, text lines} entries.
-- The list starts with a single placeholder entry whose score is -1;
-- save_chunk() inserts real entries ahead of lower-scoring ones and
-- print_chunk_list() never prints the last entry.
sequence chunk_list
chunk_list = {{-1, {}, {}}}
procedure print_chunk_list()
-- Emit the saved chunks as one small HTML table each, in list
-- order, then the "End of Search Results" footer that also closes
-- the page. The last list entry is never printed.
-- Side effect: the FILE field of every printed entry is replaced by
-- the page name returned by rename().
    sequence lines, doc_name
    integer shown

    shown = length(chunk_list) - 1
    if shown = 0 then
        html_puts("\n<font face=\"verdana, arial, geneva\" size=-1>")
        html_puts("No match found. Try again.</font>\n")
    end if

    for n = 1 to shown do
        -- heading bar: position, link to the document, score
        html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"87%\">")
        html_puts("\n<tr><td width=\"100%\"></td></tr>\n")
        html_puts("<tr><td bgcolor=\"#606060\"><font face=Arial size=1 ")
        html_puts("color=\"FFFFFF\"> ")
        doc_name = rename(chunk_list[n][FILE])
        chunk_list[n][FILE] = doc_name
        html_printf("\n<b># %d of %d ------ <a class=\"docsrch\" href=\"%s\">%s</a> --- ",
                    {n, shown, doc_name, doc_name})
        html_printf("score: %d ------</b></font></td></tr>\n",
                    {100 * chunk_list[n][SCORE] + 0.5})

        -- body: background colour alternates between odd and even rows
        if remainder(n, 2) != 0 then
            html_puts("<tr><td bgcolor=\"CCCCFF\">\n<pre>")
        else
            html_puts("<tr><td bgcolor=\"FFCCFF\">\n<pre>")
        end if
        lines = chunk_list[n][TEXT]
        for k = 1 to length(lines) do
            html_puts(clean(lines[k]))
        end for
        html_puts("</pre></td></tr></table>\n")
    end for

    html_puts("\n<p> <br>\n")
    html_puts("<font face=\"verdana, arial, geneva\" size=2 color=\"#333333\">")
    html_puts("<center>End of Search Results</center></font>\n")
    html_puts("\n</body></html>\n")
end procedure
procedure save_chunk(sequence file_name, sequence chunk, atom score)
-- Record an interesting chunk on chunk_list.
-- The score is first divided by 10 + sqrt(number of lines), which
-- reduces it slightly for larger chunks. The new entry is inserted
-- in front of the first entry with a lower score; when the list
-- then holds more than MAX_CHUNKS + 1 entries its last entry is
-- dropped. If no entry has a lower score nothing is stored.
    integer pos, count

    score = score / (10 + sqrt(length(chunk)))

    -- locate the first entry that the new chunk beats
    count = length(chunk_list)
    pos = 1
    while pos <= count do
        if score > chunk_list[pos][SCORE] then
            exit
        end if
        pos += 1
    end while
    if pos > count then
        return
    end if

    chunk_list = chunk_list[1..pos - 1] &
                 {{score, file_name, chunk}} &
                 chunk_list[pos..count]
    if length(chunk_list) > MAX_CHUNKS + 1 then
        -- drop the worst chunk on the list
        chunk_list = chunk_list[1..length(chunk_list) - 1]
    end if
end procedure
-- LETTER[c+1] is TRUE when character code c (0..255) is an ASCII
-- digit or letter, FALSE otherwise.
sequence LETTER
LETTER = repeat(FALSE, 256)
LETTER['0' + 1..'9' + 1] = TRUE
LETTER['A' + 1..'Z' + 1] = TRUE
LETTER['a' + 1..'z' + 1] = TRUE
function is_word_char(object c)
-- TRUE if c is the code of an ASCII letter or digit (per LETTER).
-- Anything that is not an integer in 0..255 - in particular the
-- highlight markers that scan() inserts into a line - is not a word
-- character. The range is tested first so that LETTER is never
-- subscripted out of bounds.
    if integer(c) then
        if c >= 0 and c <= 255 then
            return LETTER[c + 1]
        end if
    end if
    return FALSE
end function

procedure scan(sequence file_name)
-- Read one file line by line, score each line against the words in
-- word_list, and pass the interesting chunks of consecutive lines
-- to save_chunk(). A file that cannot be opened is skipped silently.
--
-- Scoring of one match of a keyword on a line:
--   10  the match is delimited on both sides (complete word)
--    8  keyword longer than 3 chars, delimited on one side only
--    7  keyword longer than 3 chars, inside a longer word
--    0  keyword of 3 chars or less that is not a complete word
-- The score is multiplied by the keyword's current value (1 at the
-- start of each chunk, halved after every match) and by 1.5 when
-- the line contains separator_line. Matches with a non-zero score
-- are wrapped in LEFT_HIGHLIGHT / RIGHT_HIGHLIGHT in the stored line.
    integer fileNum, first_hit, last_hit, new_chunk
    sequence chunk, word_value, word, low_line, low_words
    atom chunk_total, line_total
    boolean words_on_line, white_before, white_after, isTitle
    integer p, line_next, score, len
    object line

    fileNum = open(file_name, "rb")
    if fileNum = -1 then
        return
    end if

    -- lower-case the keywords once, not once per line
    low_words = word_list
    for i = 1 to length(low_words) do
        low_words[i] = fast_lower(low_words[i])
    end for

    new_chunk = TRUE
    while TRUE do
        -- initialize
        if new_chunk then
            chunk = {}
            chunk_total = 0
            first_hit = 0
            last_hit = 0
            new_chunk = FALSE
            word_value = repeat(1, length(word_list))
        end if
        line_total = 0

        -- read next line
        line = gets(fileNum)
        if atom(line) then
            exit -- end of file
        end if
        low_line = fast_lower(line)
        -- TRUE when the line has at least one character above ' '
        words_on_line = find(1, low_line > ' ') > 0
        isTitle = FALSE

        for i = 1 to length(low_words) do
            -- find all matches of this word on line
            word = low_words[i]
            len = length(word)
            line_next = 1
            while TRUE do
                p = match(word, low_line[line_next..length(line)])
                if p = 0 then
                    exit
                end if
                line_next = line_next + p - 1   -- start of the match

                -- is the match delimited on the left / on the right?
                -- (a highlight marker left by an earlier keyword
                -- counts as a delimiter)
                if line_next = 1 then
                    white_before = TRUE
                else
                    white_before = not is_word_char(line[line_next - 1])
                end if
                if line_next + len > length(line) then
                    white_after = TRUE
                else
                    white_after = not is_word_char(line[line_next + len])
                end if

                if white_before and white_after then
                    score = 10 -- full credit: complete word match
                elsif len > 3 then
                    if white_before or white_after then
                        score = 8 -- prefix or suffix
                    else
                        score = 7 -- middle of a substring
                    end if
                else
                    score = 0 -- short words must match complete words
                end if

                -- score a bit higher for matching library heading
                isTitle = match(separator_line, line) > 0
                line_total += word_value[i] * score * (1 + .5 * isTitle)
                -- note: the value is halved even when score is 0
                word_value[i] /= 2

                if score then
                    -- highlight the match
                    line = line[1..line_next - 1] &
                           LEFT_HIGHLIGHT &
                           line[line_next..line_next + len - 1] &
                           RIGHT_HIGHLIGHT &
                           line[line_next + len..length(line)]
                    low_line = fast_lower(line)
                    -- step over the two markers just inserted
                    line_next += length(LEFT_HIGHLIGHT) +
                                 length(RIGHT_HIGHLIGHT)
                end if
                line_next += len    -- resume after the matched text
            end while
        end for

        chunk = append(chunk, line)

        -- decide chunk boundaries
        if words_on_line then
            if line_total > 0 then
                chunk_total += line_total
                last_hit = length(chunk)
                if first_hit = 0 then
                    first_hit = last_hit
                end if
            end if
            if chunk_total > 0 then
                if (line_total = 0 and
                    last_hit < length(chunk) - MIN_CHUNK_SIZE/2 and
                    length(chunk) >= MIN_CHUNK_SIZE) or
                   length(chunk) >= MAX_CHUNK_SIZE then
                    -- trim off some context, but not all
                    first_hit = floor((first_hit + 1) / 2)
                    last_hit = floor((last_hit + length(chunk)) / 2)
                    save_chunk(file_name,
                               chunk[first_hit..last_hit],
                               chunk_total)
                    new_chunk = TRUE
                end if
            elsif length(chunk) >= MIN_CHUNK_SIZE then
                -- nothing found in this stretch: start over
                new_chunk = TRUE
            end if
        elsif chunk_total = 0 and length(chunk) > MIN_CHUNK_SIZE/2 then
            -- blank line and nothing found so far: start over
            new_chunk = TRUE
        end if
    end while

    -- whatever was being collected when the file ended
    if chunk_total > 0 then
        save_chunk(file_name, chunk, chunk_total)
    end if
    close(fileNum)
end procedure
function look_at(sequence path_name, sequence entry)
-- Callback handed to walk_dir() by search(): decide whether the
-- directory entry qualifies for searching and, if so, scan it.
-- Directories are skipped, and so are files listed in a manual
-- group (REF_DOC, LIB_DOC, OTHER_DOC) whose flag in `manuals` is
-- FALSE. Always returns 0.
    sequence file_name, full_path

    if find('d', entry[D_ATTRIBUTES]) then
        return 0 -- a directory
    end if

    file_name = entry[D_NAME]
    if manuals[M_REF] = FALSE and find(file_name, REF_DOC) > 0 then
        return 0
    end if
    if manuals[M_LIB] = FALSE and find(file_name, LIB_DOC) > 0 then
        return 0
    end if
    if manuals[M_OTH] = FALSE and find(file_name, OTHER_DOC) > 0 then
        return 0
    end if

    -- build directory/file, without a leading "./"
    full_path = path_name & SLASH
    if equal(full_path[1..2], {'.', SLASH}) then
        full_path = full_path[3..length(full_path)]
    end if
    scan(full_path & file_name)
    return 0
end function
function blank_delim(sequence s)
-- Break up a blank-delimited string.
-- Returns the list of words in s, where words are separated by
-- blanks and/or tabs. Scanning stops at the first '\n' or at the
-- end of s, whichever comes first, so s does not have to end in a
-- newline and a word may extend to the very last character.
    sequence list, segment
    integer i, n

    list = {}
    n = length(s)
    i = 1
    while i <= n do
        if s[i] = '\n' then
            exit
        elsif find(s[i], " \t") then
            i += 1      -- skip the separator
        else
            -- collect one word; never read past the end of s
            segment = ""
            while i <= n do
                if find(s[i], " \t\n") then
                    exit
                end if
                segment = segment & s[i]
                i += 1
            end while
            list = append(list, segment)
        end if
    end while
    return list
end function
procedure search()
-- Top level search: make the user's keywords the current word list
-- and let walk_dir() call look_at() for every entry under "docs",
-- sub-directories included. The status returned by walk_dir() is
-- not acted upon.
    integer status

    word_list = keywords
    status = walk_dir("docs", routine_id("look_at"), TRUE)
end procedure
procedure top_link(sequence percent, sequence name, sequence url, sequence w)
-- Emit the HTML for one tab of the navigation bar.
--   percent  width of the label cell, in percent, as a string
--   name     text shown for the link
--   url      link target
--   w        width of the white gap cell that follows the tab;
--            pass "" to leave the gap cell out
    sequence cells

    -- label cell followed by the rounded-corner image cell
    cells = "<td bgcolor=\"#FFCC66\" width=\"" & percent &
            "%\" align=center height=15 valign=bottom>\n" &
            "<font face=\"verdana, arial, geneva\" size=-2>\n" &
            "<a class=\"toplink\" href=\"" & url & "\"><b>" & name &
            "</b></a></font></td>\n" &
            "<td bgcolor=\"#FFCC66\" width=\"1%\" valign=top align=right>\n" &
            "<img src=\"topcr.gif\" width=5 height=5></td>\n"

    -- optional white gap before the next tab
    if length(w) > 0 then
        cells = cells & "<td bgcolor=\"#FFFFFF\" width=" & w &
                "><img src=\"dum.gif\" width=1 height=1></td>\n"
    end if
    html_puts(cells)
end procedure
procedure top_links()
-- Display the two navigation bars at the top of the page.
-- Each bar is a table holding one row of tabs (see top_link) and
-- one thin row that draws a coloured strip under every tab, with a
-- white gap between strips. The first bar is followed by an extra
-- white spacer row.
    sequence bars, links, gap

    -- {width percent, label, url} for every tab, one list per bar
    bars = {
        {{"7",  "Home",                  "index.html"},
         {"16", "What Is Euphoria?",     "hotnew.htm"},
         {"14", "Documentation",         "manual.htm"},
         {"7",  "News",                  "news.htm"},
         {"14", "The Mailing List",      "listserv.htm"},
         {"17", "Download Euphoria",     "v20.htm"},
         {"18", "Instant Registration!", "reg.htm"}},
        {{"23", "Recent User Contributions", "contrib.htm"},
         {"12", "The Archive",               "archive.htm"},
         {"22", "Other Euphoria Web Sites",  "othersit.htm"},
         {"16", "RDS Development",           "contract.htm"},
         {"22", "Related Books & Software",  "books.htm"}}
    }

    for b = 1 to length(bars) do
        links = bars[b]

        -- row of tabs; every tab but the last is followed by a gap
        html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
        html_puts("<tr valign=top>\n")
        for k = 1 to length(links) do
            if k < length(links) then
                gap = "1"
            else
                gap = ""
            end if
            top_link(links[k][1], links[k][2], links[k][3], gap)
        end for
        html_puts("</tr>\n")

        -- strip under the first tab, then gap + strip for each other tab
        html_puts("<tr>\n")
        html_puts("<td colspan=2 bgcolor=\"#FFCC66\" height=3>\n")
        html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
        for k = 2 to length(links) do
            html_puts("<td bgcolor=\"#FFFFFF\"><img src=\"dum.gif\" width=1 height=1></td>\n")
            html_puts("<td colspan=2 bgcolor=\"#FFCC66\">\n")
            html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
        end for
        html_puts("</tr>\n")

        if b = 1 then
            -- white spacer row between the two bars
            html_puts("<tr>\n")
            html_puts("<td colspan=20 bgcolor=\"#FFFFFF\" height=3>\n")
            html_puts("<img src=\"dum.gif\" width=1 height=1></td>\n")
            html_puts("</tr>\n")
        end if
        html_puts("</table>\n")
    end for
end procedure
procedure htmlHeader1()
-- First batch of HTML: the CGI content-type header, the document
-- head, the opening body tag, the navigation bars (top_links) and
-- the "Search Results" title block.
    sequence spacer_row

    -- a 7-pixel-high empty row, used above and below the rule
    spacer_row = "<tr>\n" &
                 "<td colspan=2 height=7><img src=\"dum.gif\" width=1 height=1></td>\n" &
                 "</tr>\n"

    html_puts("Content-type: text/html\n\n" &
              "<html><head>\n" &
              "<title>search results for Euphoria File Archive</title>\n" &
              "<base href=\"http://www.RapidEuphoria.com/\">\n" &
              "<link REL=StyleSheet HREF=\"global.css\" TYPE=\"text/css\" MEDIA=screen>\n" &
              "</head>\n" &
              "<body bgcolor=\"#FFFFFF\" link=\"#003366\"" &
              " vlink=\"#006699\" text=\"#000000\">\n" &
              "<basefont size=3>\n\n")

    top_links()

    html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n" &
              "<tr valign=top>\n" &
              "<td width=\"95%\"></td>\n" &
              "<td></td>\n" &
              "</tr>\n" &
              spacer_row &
              "<tr>\n" &
              "<td colspan=2 bgcolor=\"#CCCC99\"><img src=\"dum.gif\" width=1 height=1></td>\n" &
              "</tr>\n" &
              spacer_row &
              "<tr>\n" &
              "<td align=center>\n" &
              "<font face=\"Arial, Helvetica\" color=\"#CC3366\" size=+2><b>\n" &
              "Search Results</b></font></td>\n" &
              "<td></td>\n" &
              "</tr>\n" &
              "</table>\n\n")
end procedure
procedure htmlHeader2()
-- Second batch of HTML: the "Search Again" form. The three manual
-- checkboxes are pre-checked from `manuals` and the text field is
-- pre-filled with the current `keywords`, separated by blanks.
-- The keywords come straight from the query string, so the
-- characters that are special inside a quoted HTML attribute
-- (" & < >) are written as character entities.
    object c

    html_puts("<p>\n")
    html_puts("<form method=GET action=\"cgi-bin/guru.exu\">\n\n")
    html_puts("<table border=1 cellpadding=0 cellspacing=0 width=\"100%\">\n")
    html_puts("<tr bgcolor=\"#FFFFEE\">\n")
    html_puts("<td>\n")
    html_puts("<table border=0 cellpadding=0 cellspacing=0 width=\"100%\">\n")
    html_puts("<tr>\n")
    html_puts("<td width=\"6%\"></td>\n")
    html_puts("<td width=\"22%\"></td>\n")
    html_puts("<td width=\"50%\"></td>\n")
    html_puts("<td width=\"22%\"></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts(
        "<td colspan=4 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")

    -- row with the "Search Again:" label and the manual checkboxes
    html_puts("<tr>\n")
    html_puts("<td></td>\n")
    html_puts(
        "<td rowspan=3><font face=\"Arial, Helvetica\" size=3 color=\"#990033\">\n")
    html_puts("<b>Search Again:</b></font>\n")
    html_puts("</td>\n")
    html_puts("<td>\n")
    html_puts("<font face=\"Arial, Helvetica\" size=-2>\n")
    html_puts("<input type=\"CHECKBOX\" name=\"refDoc\"")
    if manuals[M_REF] then
        -- leading blank keeps CHECKED apart from the name attribute
        html_puts(" CHECKED")
    end if
    html_puts("><font color=\"#8080FF\"><b>Reference Manual</b> </font>\n")
    html_puts("<input type=\"CHECKBOX\" name=\"libDoc\"")
    if manuals[M_LIB] then
        html_puts(" CHECKED")
    end if
    html_puts("><font color=\"#8080FF\"><b>Library Routines</b> </font>\n")
    html_puts("<input type=\"CHECKBOX\" name=\"otherDoc\"")
    if manuals[M_OTH] then
        html_puts(" CHECKED")
    end if
    html_puts("><font color=\"#8080FF\"><b>Other Documents</b> </font>\n")
    html_puts("</font>\n")
    html_puts("<td align=center><font face=\"Comic Sans MS\" size=-2><i>\n")
    html_puts("<a href=\"asearch.txt\">Powered by Euphoria</a></i></font>\n")
    html_puts("</td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts("<td height=3><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("<td colspan=2><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")

    -- row with the keyword field and the submit button
    html_puts("<tr>\n")
    html_puts("<td></td>\n")
    html_puts("<td>\n")
    html_puts("<input type=\"text\" name=\"keywords\" size=45 \n")
    html_puts("value=\"")
    for i = 1 to length(keywords) do
        for j = 1 to length(keywords[i]) do
            c = keywords[i][j]
            if c = '"' then
                html_puts("&quot;")
            elsif c = '&' then
                html_puts("&amp;")
            elsif c = '<' then
                html_puts("&lt;")
            elsif c = '>' then
                html_puts("&gt;")
            else
                html_puts(c)
            end if
        end for
        if i != length(keywords) then
            html_puts(" ") --"+")
        end if
    end for
    html_puts("\">\n")
    html_puts("</td>\n")
    html_puts("<td>\n")
    html_puts("<input type=\"submit\" value=\"Search!\"></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts(
        "<td colspan=4 height=3><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")

    -- hint line under the keyword field
    html_puts("<tr>\n")
    html_puts("<td colspan=2></td>\n")
    html_puts("<td colspan=2>\n")
    html_puts(
        "<font face=\"verdana, arial, geneva\" size=-2 color=\"#333333\">\n")
    html_puts("Type one or more words.</font></td>\n")
    html_puts("</tr>\n")
    html_puts("<tr>\n")
    html_puts(
        "<td colspan=4 height=7><img src=\"dum.gif\" width=1 height=1></td>\n")
    html_puts("</tr>\n")
    html_puts("</table>\n")
    html_puts("</td>\n")
    html_puts("</tr>\n")
    html_puts("</table>\n")
    html_puts("</form>\n")
    html_puts("<p>\n")
end procedure
function hex_char(integer c)
-- Is c a valid hex character?
-- Returns 0 if it is not, otherwise the (non-zero) position of c
-- in the list of hex digits.
    sequence hex_digits

    hex_digits = "0123456789ABCDEFabcdef"
    return find(c, hex_digits)
end function
function hex_val(integer c)
-- Return the value of a hex character: 10..15 for 'A'..'F' and
-- 'a'..'f', and c - '0' for everything else.
    integer k

    k = find(c, "ABCDEF")
    if k = 0 then
        k = find(c, "abcdef")
    end if
    if k != 0 then
        return 9 + k
    end if
    return c - '0'
end function
function decode_field(sequence s, integer i, sequence stops)
-- Helper for parse_input: decode one URL-encoded field.
-- Starting at s[i], characters are collected until a 0 or one of
-- the characters in stops is reached. "%XX" with two hex digits
-- becomes the character with that code and '+' becomes a blank.
-- s must end with at least two 0 elements so that looking ahead
-- after a '%' stays inside the sequence.
-- Returns {decoded text, index of the terminating element}.
    sequence text
    integer c

    text = ""
    while 1 do
        c = s[i]
        if c = 0 or find(c, stops) then
            exit
        end if
        if c = '%' and hex_char(s[i+1]) and hex_char(s[i+2]) then
            c = 16 * hex_val(s[i+1]) + hex_val(s[i+2])
            i += 2
        elsif c = '+' then
            c = ' '
        end if
        text &= c
        i += 1
    end while
    return {text, i}
end function

function parse_input(sequence s)
-- crack the syntax sent from Web browser: aaa=bbb&ccc=ddd&...
-- Convert to {{"aaa", "bbb"}, {"ccc", "ddd"}, ...} left-right pairs
-- Both sides are URL-decoded. A name that is not followed by '='
-- gets an empty right side, and the pair after it is still parsed
-- as a pair of its own.
    integer i
    sequence word_pairs, field, left_word, right_word

    word_pairs = {}
    s &= {0,0,0} -- end markers
    i = 1
    while s[i] != 0 do
        -- left word: up to '=', '&' or the end
        field = decode_field(s, i, "=&")
        left_word = field[1]
        i = field[2]

        -- right word: only present when the left word ended at '='
        right_word = ""
        if s[i] = '=' then
            field = decode_field(s, i + 1, "&")
            right_word = field[1]
            i = field[2]
        end if

        -- step over the '&' that separates this pair from the next
        if s[i] = '&' then
            i += 1
        end if
        word_pairs = append(word_pairs, {left_word, right_word})
    end while
    return word_pairs
end function
function getKeywords()
-- Read the CGI query string (e.g. dos=on&keywords=apple+orange)
-- from the environment and set the globals:
--   manuals   flag set to TRUE for each of the fields refdoc,
--             libdoc, otherdoc that is present (names compared in
--             lower case)
--   keywords  the blank-separated words of the "keywords" field,
--             with double-quote characters removed
-- Returns "" on success, or a message for the user when no keyword
-- was supplied. A missing query string is a fatal error.
    sequence keystring, key, pairs, field_name
    object query, ch
    integer which

    -- try the upper-case variable name first, then the lower-case one
    query = getenv("QUERY_STRING")
    if atom(query) then
        query = getenv("query_string")
    end if
    if atom(query) then
        errMessage("Internal Error - no query_string")
    end if

    pairs = parse_input(query)
    keystring = ""
    for n = 1 to length(pairs) do
        field_name = lower(pairs[n][1])
        which = find(field_name, {"refdoc", "libdoc", "otherdoc", "keywords"})
        if which = 1 then
            manuals[M_REF] = TRUE
        elsif which = 2 then
            manuals[M_LIB] = TRUE
        elsif which = 3 then
            manuals[M_OTH] = TRUE
        elsif which = 4 then
            keystring = pairs[n][2]
        end if
    end for

    keywords = {}
    if length(keystring) = 0 then
        return "Enter one or more words for searching. Try again."
    end if

    -- make list of keywords from keystring; the extra blank at the
    -- end flushes the last word
    key = ""
    keystring &= ' '
    for n = 1 to length(keystring) do
        ch = keystring[n]
        if ch = ' ' then
            if length(key) > 0 then
                keywords = append(keywords, key)
                key = ""
            end if
        elsif ch != '"' then
            key &= ch
        end if
    end for

    if length(keywords) = 0 then
        return "Type one or more keywords for search. Try again."
    end if
    return ""
end function
-- Main program: emit the page top, read the query, show the search
-- form, then search and print the results.
sequence msg
htmlHeader1() -- content-type header, page head, navigation bars, title
msg = getKeywords() -- sets keywords and manuals; "" or a message for the user
htmlHeader2() -- the "Search Again" form, pre-filled from keywords/manuals
if length(msg) > 0 then
errMessage(msg) -- prints the message, closes the page and aborts
end if
flush(1) -- write out everything produced so far before searching
search()
print_chunk_list()