3 # This perl script builds an sqlite search index for this site.
5 # Taken from https://github.com/gbxyz/webidx/tree/main
7 # Copyright (c) 2024, Gavin Brown
8 # Full license: https://github.com/gbxyz/webidx/blob/a28a984d38fd546d1bec4d6a4a5a47ab86cb08f8/LICENSE
10 # Modifications have been made since taking a copy of the code
11 # to suit this particular website and use-case
13 # To use this script, install the following packages (tested on Fedora 39):
14 # perl-open perl-HTML-Parser perl-DBD-SQLite
17 use Getopt::Long qw(:config bundling auto_version auto_help);
20 use File::Basename qw(basename);
21 use File::Glob qw(:bsd_glob);
25 use List::Util qw(uniq none any);
27 use open qw(:encoding(utf8));
30 use vars qw($VERSION);
35 # parse command line options
my (@exclude, @excludePattern, $compress, $origin);

# GetOptions() has already printed a diagnostic for the offending option by
# the time it returns false, so just add a hint and exit with a failure status
GetOptions(
    'exclude|x=s'           => \@exclude,
    'excludePattern|xP=s'   => \@excludePattern,
    'compress|z'            => \$compress,
    'origin|o=s'            => \$origin,
) or die("invalid command line options, try --help\n");

# excluded files are compared against absolute paths during the scan, so
# normalise them here. abs_path() returns undef for a path it cannot resolve;
# such an entry can never match a scanned file, so drop it (with a warning)
# rather than keeping undef in the list
@exclude = grep { defined } map {
    my $resolved = abs_path($_);
    warn("cannot resolve excluded path '$_', ignoring it\n") unless (defined($resolved));
    $resolved;
} @exclude;

# determine the source directory and the database filename

my $dir_arg = shift(@ARGV) || '.';
my $dir = abs_path($dir_arg);

# an unresolvable directory leaves $dir undefined, and a glob pattern built as
# "$dir/*" would then silently start at the filesystem root
die("source directory '$dir_arg' does not exist or is not a directory\n") unless (defined($dir) && -d $dir);

my $dbfile = abs_path(shift(@ARGV) || $dir.'/webidx.db');
die("cannot resolve the database path\n") unless (defined($dbfile));

# initialise the database

# remove any existing database file before connecting, and stop if that fails
# instead of carrying on with a stale file in place
if (-e $dbfile) {
    unlink($dbfile) or die("cannot remove existing database '$dbfile': $!\n");
}
52 my $db = DBI->connect('dbi:SQLite:dbname='.$dbfile, '', '', {
# stop words: very frequent words that are left out of the index
my @common = qw(
    be and of a in to it i for he
    on do at but from that not by or as
    can who get if my as up so me the
    are we was is you this with an when want
    our there has
);
64 # this is a map of filename => page title
69 # this is a map of word => page => occurrence count
74 # scan the source directory
77 say 'scanning ', $dir;
82 # generate the database
85 say 'finished scan, generating index';
# create the schema: a table of pages, a table of words, and a table linking
# word ids to the ids of the pages they appear on
foreach my $ddl (
    qq{CREATE TABLE `pages` (`id` INTEGER PRIMARY KEY, `url` TEXT, `title` TEXT)},
    qq{CREATE TABLE `words` (`id` INTEGER PRIMARY KEY, `word` TEXT)},
    qq{CREATE TABLE `index` (`id` INTEGER PRIMARY KEY, `word` INT, `page_id` INT)},
) {
    $db->do($ddl);
}

# prepare one INSERT statement per table. scalar() keeps each handle in its
# own slot even if a prepare() call were to return an empty list
my ($word_sth, $page_sth, $index_sth) = map { scalar($db->prepare($_)) } (
    qq{INSERT INTO `words` (`word`) VALUES (?)},
    qq{INSERT INTO `pages` (`url`, `title`) VALUES (?, ?)},
    qq{INSERT INTO `index` (`word`, `page_id`) VALUES (?, ?)},
);
103 foreach my $word (keys(%{$index})) {
106 # insert an entry into the words table (if one doesn't already exist)
108 if (!defined($word_ids->{$word})) {
109 $word_sth->execute($word);
110 $word_ids->{$word} = $db->last_insert_id;
116 foreach my $page (keys(%{$index->{$word}})) {
118 # clean up the page title by removing leading and trailing whitespace
120 my $title = $titles->{$page};
121 $title =~ s/^[ \s\t\r\n]+//g;
122 $title =~ s/[ \s\t\r\n]+$//g;
124 # remove any trailing "· BookStack"
126 $title =~ s/· BookStack$//;
129 # remove the directory
136 $page = $origin.$page if ($origin);
139 # Trim off the /index.html
141 $page =~ s/\/index\.html$//;
144 # insert an entry into the pages table (if one doesn't already exist)
146 if (!defined($page_ids->{$page})) {
147 $page_sth->execute($page, $title);
148 $page_ids->{$page} = $db->last_insert_id;
152 # insert an index entry
154 $index_sth->execute($word_ids->{$word}, $page_ids->{$page}) || die();
say 'compressing database...';

# run gzip synchronously using the list form of system(), which bypasses the
# shell so $dbfile needs no quoting. waiting for gzip and checking its exit
# status means a failed compression is reported instead of going unnoticed.
# -f overwrites any existing .gz file, -9 selects maximum compression
system(qw(gzip -f -9), $dbfile) == 0
    or die("gzip failed for '$dbfile' (status $?)\n");
172 # reads the contents of a directory: all HTML files are indexed, all directories
173 # are scanned recursively. symlinks to directories are *not* followed
178 foreach my $file (map { abs_path($_) } bsd_glob(sprintf('%s/*', $dir))) {
181 next if (any { $file =~ m/\Q$_/i } @excludePattern);
186 scan_directory($file);
188 } elsif ($file =~ /\.html?$/i) {
190 # HTML file, index it
204 return if (any { $_ eq $file } @exclude) || (any { $file =~ m/\Q$_/i } @excludePattern);
210 my $parser = HTML::Parser->new(
215 if ('title' eq $currtag) {
217 # <title> tag, which goes into the $titles hashref
223 # everything else, which just gets appended to the $text string
235 if ('main' eq $currtag) {
# add the alt attributes of images, and any title attributes found.
# presumably $_[0] is the tag name and $_[1] the attribute hashref, as set by
# the handler's argspec - confirm against the HTML::Parser->new() call.
# not every <img> carries an alt attribute, so only append it when present,
# and lowercase the tag name itself rather than the constant it is compared to
$text .= " ".$_[1]->{'alt'} if ('img' eq lc($_[0]) && defined($_[1]->{'alt'}));
$text .= " ".$_[1]->{'title'} if (defined($_[1]->{'title'}));
253 if ('main' eq $_[0]) {
259 $parser->unbroken_text(1);
262 # we expect these elements to contain text we don't want to index
264 $parser->ignore_elements(qw(script style header nav footer svg));
# open the file, being careful to ensure it's treated as UTF-8.
# an explicit read mode stops IO::File from falling back to two-argument
# open(), which gives special meaning to some characters in a filename.
# a failed open is reported with the filename and reason, rather than
# surfacing as a method call on an undefined value on the next line
my $fh = IO::File->new($file, 'r') or die("cannot open '$file' for reading: $!\n");
$fh->binmode(qq{:utf8});
275 $parser->parse_file($fh);
280 $titles->{$file} = $title;
281 my @words = grep { my $w = $_ ; none { $w eq $_ } @common } # filter out common words
282 grep { /\w/ } # filter out strings that don't contain at least one word character
284 $_ =~ s/^[^\w]+//g; # remove leading non-word characters
285 $_ =~ s/[^\w]+$//g; # remove trailing non-word characters
288 split(/[\s\r\n]+/, lc($text)); # split by whitespace
290 foreach my $word (@words) {
292 # increment the counter for this word/file
294 $index->{$word}->{$file}++;
302 webidx [-x FILE [-x FILE2 [...]]] [--xP PATTERN [--xP PATTERN2 [...]]] [-o ORIGIN] [-z] [DIRECTORY] [DBFILE]
304 This will cause all HTML files in C<DIRECTORY> to be indexed, and the resulting database written to C<DBFILE>. The supported options are:
308 =item * C<-x FILE> specifies a file to be excluded. May be specified multiple times.
310 =item * C<--xP PATTERN> specifies a string; any file or folder whose path contains it (ignoring case) is excluded. The string is matched literally, not as a regular expression. May be specified multiple times.
312 =item * C<-o ORIGIN> specifies a base URL which will be prepended to the filenames (once C<DIRECTORY> has been removed).
314 =item * C<-z> specifies that the database file should be compressed once generated. If specified, the database will be at C<DBFILE.gz>.
316 =item * C<DIRECTORY> is the directory to be indexed, defaults to the current working directory.
318 =item * C<DBFILE> is the location where the database should be written. If not specified, defaults to C<DIRECTORY/webidx.db>.