# /usr/share/dwww/swish++.conf

###
## dwww's swish++.conf file
## $Id: swish++.conf,v 1.7 2007-01-06 17:05:11 robert Exp $
###
AssociateMeta		yes
#
# used by: index; when "no", same as the -A option.
#
#	Associate words with meta names during indexing.

#ExcludeClass		no_index
#
# used by: index; same as the -C option.
#
#	For HTML and XHTML files only (as specified by IncludeFile), a set of
#	class names of HTML and XHTML elements whose content text is not to be
#	indexed.

ExcludeFile		*.gif *.jpg *.png *.tar*
#
# used by: index, extract; same as the -E option.
#
#	A set of filename patterns of files not to index or extract.  Case is
#	significant.  You normally specify either IncludeFile or ExtractFile
#	-OR- ExcludeFile (whichever is easier to specify), but not both.


#ExtractExtension	txt
#
# used by: extract; same as the -x option.
#
#	The extension to append to filenames during extraction.

#ExtractFile		*.doc *.ppt *.xls
#
# used by: extract; same as the -e option.
#
#	A set of filename patterns of files to extract.  Case is significant.
#	Filename patterns specified here MUST NOT also be specified in
#	ExcludeFile.
#
#	You should modify the set to include only those that you are actually
#	using for increased performance.

#ExtractFilter		no
#
# used by: extract; when "yes", same as the -f option.
#
#	When "yes", extract a single file to standard output.

#FilesGrow		100
#
# used by: index; same as the -g option.
#
#	The number of files to grow reserved space for when incrementally
#	indexing.  The number may be specified as either an absolute number or
#	a percentage (when a trailing % is present).

#FilesReserve		1000
#
# used by: index; same as the -F option.
#
#	The initial number of files to reserve space for.  During indexing,
#	this can be exceeded without any problem, but there will be a slight
#	performance penalty.
#
#	If you know approximately how many files you have, modify the above
#	value!

FilterFile *.bz2	bunzip2 -c %f > @/var/cache/dwww/swish.tmp.%B
FilterFile *.gz		gunzip -c %f > @/var/cache/dwww/swish.tmp.%B
FilterFile *.Z		uncompress -c %f > @/var/cache/dwww/swish.tmp.%B
#FilterFile *.doc	antiword %f @/var/cache/dwww/swish.tmp.%B.txt
#FilterFile *.pdf	pdftotext %f @/var/cache/dwww/swish.tmp.%B.txt
#FilterFile *.ps	pstotext %f > @/var/cache/dwww/swish.tmp.%B.txt

#
# used by: index, extract; no option equivalent.
#
#	Filter files having certain extensions prior to either indexing or
#	extraction.
#
#	See http://www.wvware.com/ for information about the wvText program.
#	On Debian, antiword does a very good job of converting *.doc files.
#	See http://www.research.compaq.com/SRC/virtualpaper/pstotext.html for
#	information about the pstotext program.

#FollowLinks		no
#
# used by: index, extract; same as the -l option.
#
#	Follow symbolic links during indexing or extraction.

IncludeFile text	*.txt *.text 
IncludeFile HTML	*.asp *.*htm* *.jsp
#IncludeFile ID3	*.mp3
IncludeFile LaTeX	*.tex
#IncludeFile Mail	*.m
IncludeFile Man		*.[1-9n] *.[1-9][a-z]
IncludeFile RTF		*.rtf

#
# used by: index; same as the -e option.
#
#	A set of filename patterns of files to index and the modules they map
#	to.  Case is irrelevant for the module name but significant for the
#	patterns.  Filename patterns specified here MUST NOT also be specified
#	in ExcludeFile.
#
#	You should modify the set to include only those that you are actually
#	using for increased performance.


#IndexFile		/var/cache/dwww/dwww.swish++.index
### This option is set by dwww via a command-line switch.
#
# used by: index, search; same as the -i option.
#
#	The name of the index file either generated or searched.

RecurseSubdirs		no
#
# used by: index, extract; when "no", same as the -r option.
#
#	When "no", do not recursively index the files in subdirectories, that
#	is when a directory is encountered, all the files in that directory are
#	indexed (modulo the filename patterns specified via the IncludeFile,
#	ExcludeFile, or ExtractFile variables), but subdirectories encountered
#	are ignored and therefore the files contained in them are not indexed.
#	(This variable is most useful when specifying the directories and files
#	via standard input.)  The default is to index the files in
#	subdirectories recursively.

ResultsMax		100
#
# used by: search; same as the -m option.
#
#	The maximum number of results to return overriding the compiled-in
#	default (which is usually 100).

ResultSeparator	"__--__"
#
# used by: search; same as the -R option
#
#	The string to separate the parts in a search result when ResultsFormat
#	is "classic".  Either single or double quotes can be used to preserve
#	whitespace.  Quotes are stripped only if they match.

ResultsFormat		classic
#
# used by: search; same as the -F option
#
#	The output format of search results: either "classic" or "XML".

SearchBackground	no
#
# used by: search; when "no", same as the -B option.
#
#	When "yes" and SearchDaemon is not "none", automatically detach from
#	the terminal and run in the background.


#StemWords		no
#
# used by: search; when "yes", same as the -s option.
#
#	Perform stemming (suffix stripping) on words during searches.  Words
#	that end in the wildcard character are not stemmed.

#StopWordFile		custom_stop_word_file
#
# used by: index, extract; same as the -s option.
#
#	The name of a file containing the set of stop-words to use instead of
#	the built-in set.

StoreWordPositions	no
#
# used by: index; when "no", same as the -P option.
#
#	Store word positions during indexing needed to do "near" searches.
#	Storing said data approximately doubles the size of the generated
#	index.

TempDirectory		/var/cache/dwww
#
# used by: index
#
#	Directory to use for temporary files during indexing.  If your OS
#	mounts swap space on /tmp, as indexing progresses and more files get
#	created in /tmp, you will have less swap space, indexing will get
#	slower, and you may run out of memory.  If this is the case, you can
#	specify a directory on a real filesystem, i.e., one on a physical
#	disk.  The directory must exist.

#ThreadsMin		5
#ThreadsMax		100
#
# used by: search; same as the -t or -T option, respectively.
#
#	The minimum/maximum number of simultaneous threads, respectively; used
#	only when SearchDaemon is not "none".

#ThreadTimeout		30
#
# used by: search; same as the -O option.
#
#	Number of seconds until an idle spare thread times out and destroys
#	itself; used only when SearchDaemon is not "none".

#TitleLines		12
#
# used by: index; same as the -t option.
#
#	For HTML and XHTML files only, the maximum number of lines into a file
#	to look at for HTML and XHTML <TITLE> tags.  The default is 12.  Larger
#	numbers slow indexing.

#Verbosity		0
#
# used by: index, extract; same as the -v option.
#
#	Print additional information to standard output during indexing or
#	extraction.  The verbosity levels are 0-4; see index(1) or extract(1)
#	for details.

# WordFilesMax		10000
#
# used by: index, search; same as the -f option.
#
#	The maximum number of files a word may occur in before it is discarded
#	as being too frequent.  The default is infinity.

WordPercentMax		101
#
# used by: index, search; same as the -p option.
#
#	The maximum percentage of files a word may occur in before it is
#	discarded as being too frequent.  If you want to keep all words
#	regardless, specify 101.

WordThreshold		50000
#
# used by: index; same as the -W option.
#
#	The word count past which partial indices are generated and merged
#	since all the words are too big to fit into memory at the same time.
#	If you index and your machine begins to swap like mad, lower this
#	value.  The above works OK on a 64MB machine.  A rule of thumb is to
#	add 250000 words for each additional 64MB of RAM you have.  These
#	numbers are for a SPARC machine running Solaris.  Other machines
#	running other operating systems use memory differently.  You simply
#	have to experiment.  Only the super-user can specify a value larger
#	than the compiled-in default.

# the end
# dwww 1.11.7 /usr/share/dwww/swish++.conf