Version 7 (modified by tianxc, 16 years ago)

--

Swish-e

swishbuild Building Swish-e package

swish-
swishbuild-
swishbuild-again 

Important

  • Do not run swish-e as root.

swish-e configuration (spider.conf)

# spider.conf -- list of sites for the Swish-e spider (spider.pl) to crawl.
#
# Each hash below describes one site; the file ends by collecting references
# to all of them in @servers.  Keys used here (see the spider.pl
# documentation for the exact semantics of each option):
#   email       - contact address of the person running the crawl
#   base_url    - URL at which the crawl of this site starts
#   test_url    - callback invoked with a URI object; as written it returns
#                 true only when the URL path does NOT end in one of the
#                 listed file extensions (case-insensitive)
#   delay_sec   - delay setting handed to the spider
#   max_depth   - crawl depth limit for this site
#   credentials - 'username:password' is a placeholder; substitute a real
#                 account before running, and never publish the real value

# Main Daya Bay web site.
my %dayabay = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.ihep.ac.cn/',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    # Fixed: max_depth used to hold the 'username:password' placeholder,
    # which is not a number.  The depth of 1 mirrors the sibling entries --
    # TODO confirm the depth actually wanted for the main site, and drop
    # the credentials line if this site needs no login.
    max_depth   => '1',
    credentials => 'username:password'
);

# Private documents area hosted at BNL.
my %documents = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.bnl.gov/private/documents/',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '1',
    credentials => 'username:password'
);

# DocDB "all documents" listing.
my %docdb = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.ihep.ac.cn/cgi-bin/DocDB/ListBy?alldocs=1',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '1',
    credentials => 'username:password'
);

# EngDB "all documents" listing.
my %engdb = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.ihep.ac.cn/cgi-bin/EngDB/ListBy?alldocs=1',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '1',
    credentials => 'username:password'
);

# Internal pages of the main site.
my %internal = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.ihep.ac.cn/internal/',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '1',
    credentials => 'username:password'
);

# Public wiki; the only entry configured without credentials.
my %publicwiki = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'https://wiki.bnl.gov/dayabay/index.php?title=Main_Page',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '2',
);

# Private wiki.
my %privatewiki = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'https://wiki.bnl.gov/dayabay-private/index.php?title=Main_Page',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '2',
    credentials => 'username:password'
);

# Subversion repository; additionally skips pdf and ps files and uses the
# deepest crawl (10) of all entries.
my %repository  = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.ihep.ac.cn/svn/dybsvn',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|pdf|ps|dia)$/i },
    delay_sec   => '1',
    max_depth   => '10',
    credentials => 'username:password'
);

# Trac instance for the dybsvn repository.
my %trac  = (
    email       => 'tianxc@ihep.ac.cn',
    base_url    => 'http://dayabay.ihep.ac.cn/tracs/dybsvn',
    test_url    => sub {  $_[0]->path !~ /\.(?:gif|jpeg|png|gz|root|dia)$/i },
    delay_sec   => '1',
    max_depth   => '2',
    credentials => 'username:password'
);

# @servers is intentionally a package variable (no `my`): presumably
# spider.pl reads it after loading this file -- verify against spider.pl
# before adding strictures or a lexical declaration here.
@servers = ( \%dayabay, \%documents, \%docdb, \%internal, \%engdb, \%publicwiki, \%privatewiki, \%repository, \%trac );

# A config file loaded by Perl should end with a true value.
1;

swish.conf

# Example configuration file

# Tell Swish-e what to index (equivalent to the -i command-line switch):
IndexDir spider.pl


# And pass the name of spider config file to the spider:
SwishProgParameters spider.conf


# Tell Swish-e that .txt files are to use the text parser:
IndexContents TXT* .txt


# Otherwise, use the HTML parser:
DefaultContents HTML*


# Ask libxml2 to report any parsing errors and warnings or
# any UTF-8 to 8859-1 conversion errors:
ParserWarnLevel 9

Indexing

# Update the timestamp of indexing_time.file (presumably a marker of when this
# indexing run started -- confirm how the file is consumed).
touch indexing_time.file
# Run Swish-e with the settings from swish.conf. "-S prog" selects the
# external-program input method, so the program named by IndexDir in
# swish.conf (spider.pl) supplies the documents to index.
/usr/local/bin/swish-e -c swish.conf -S prog

Troubleshooting

  • External program failed to return required headers Path-Name (Swish-e 2.4.5)
    Warning: document 'http://dayabay.ihep.ac.cn/svn/dybsvn/data/trunk/SimuAlg/output/neutron.output' has no content
    Warning: document 'http://dayabay.ihep.ac.cn/svn/dybsvn/db/trunk/DatabaseMaintenance/dict/dict.h' has no content
    Warning: document 'http://dayabay.ihep.ac.cn/svn/dybsvn/db/trunk/DatabaseMaintenance/dict/dict.xml' has no content
    Warning: document 'http://dayabay.ihep.ac.cn/svn/dybsvn/db/trunk/Util/src/Singleton.cc' has no content
    Warning: document 'http://dayabay.ihep.ac.cn/svn/dybsvn/dybgaudi/trunk/Control/DybCommon/ChangeLog' has no content
    Warning: document 'http://dayabay.ihep.ac.cn/svn/dybsvn/gaudi/trunk/Gaudi/options/empty.opts' has no content
    External Program found: /home/tianxc/local/lib/swish-e/spider.pl
    
    Warning: Unknown header line: 's' from program spider.pl
    
    Warning: Unknown header line: '//--------------------------------------------------------------' from program spider.pl
    
    Warning: Unknown header line: 'ApplicationMgr.EvtMax   = 1;' from program spider.pl
    
    Warning: Unknown header line: 'ApplicationMgr.EvtSel  = "NONE";' from program spider.pl
    err: External program failed to return required headers Path-Name:
    
    In spider.pl (see http://swish-e.org/archive/2007-03/11360.html), change:
    my $bytecount = length pack 'C0a*', $$content;
    
    to:
    my $bytecount = length($$content);
    
  • HTTP authorization with password-protected Trac/MediaWiki (TODO)

Right now, the following cannot be indexed properly and need to be fixed:

https://wiki.bnl.gov/dayabay-private/index.php?title=Main_Page
http://dayabay.ihep.ac.cn/tracs/dybsvn
http://dayabay.ihep.ac.cn/tracs/dybsvn/browser