/usr/bin/checkbot is in checkbot 1.80-3.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 
#!/usr/bin/perl -w
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
if 0; # not running under some shell
#
# checkbot - A perl5 script to check validity of links in www document trees
#
# Hans de Graaff <hans@degraaff.org>, 1994-2005.
# Based on Dimitri Tischenko, Delft University of Technology, 1994
# Based on the testlinks script by Roy Fielding
# With contributions from Bruce Speyer <bruce.speyer@elecomm.com>
#
# This application is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Info-URL: http://degraaff.org/checkbot/
#
# $Id: checkbot 238 2008-10-15 12:55:00Z graaff $
# (Log information can be found at the end of the script)
require 5.004;
use strict;
require LWP;
use File::Basename;
BEGIN {
eval "use Time::Duration qw(duration)";
$main::useduration = ($@ ? 0 : 1);
}
# Version information
my
$VERSION = '1.80';
=head1 NAME
Checkbot - WWW Link Verifier
=head1 SYNOPSIS
checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>]
[B<--mailto> email addresses] [B<--noproxy> list of domains]
[B<--verbose>]
[B<--url> start URL]
[B<--match> match string] [B<--exclude> exclude string]
[B<--proxy> proxy URL] [B<--internal-only>]
[B<--ignore> ignore string]
[B<--filter> substitution regular expression]
[B<--style> style file URL]
[B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout]
[B<--interval> seconds] [B<--dontwarn> HTTP response codes]
[B<--enable-virtual>]
[B<--language> language code]
[B<--suppress> suppression file]
[start URLs]
=head1 DESCRIPTION
Checkbot verifies the links in a specific portion of the World Wide
Web. It creates HTML pages with diagnostics.
Checkbot uses LWP to find URLs on pages and to check them. It supports
the same schemes as LWP does, and finds the same links that
HTML::LinkExtor will find.
Checkbot considers links to be either 'internal' or
'external'. Internal links are links within the web space that needs
to be checked. If an internal link points to a web document this
document is retrieved, and its links are extracted and
processed. External links are only checked to be working. Checkbot
checks links as it finds them, so internal and external links are
checked at the same time, even though they are treated differently.
Options for Checkbot are:
=over 4
=item --cookies
Accept cookies from the server and offer them again at later
requests. This may be useful for servers that use cookies to handle
sessions. By default Checkbot does not accept any cookies.
=item --debug
Enable debugging mode. Not really supported anymore, but it will keep
some files around that otherwise would be deleted.
=item --file <file name>
Use the file I<file name> as the basis for the summary file names. The
summary page will get the I<file name> given, and the server pages are
based on the I<file name> without the .html extension. For example,
setting this option to C<index.html> will create a summary page called
index.html and server pages called index-server1.html and
index-server2.html.
The default value for this option is C<checkbot.html>.
=item --help
Shows brief help message on the standard output.
=item --mailto <email address>[,<email address>]
Send mail to the I<email address> when Checkbot is done checking. You
can give more than one address separated by commas. The notification
email includes a small summary of the results. As of Checkbot 1.76
email is only sent if problems have been found during the Checkbot
run.
=item --noproxy <list of domains>
Do not proxy requests to the given domains. The list of domains must
be a comma-separated list. For example, to avoid using the proxy for
the localhost and someserver.xyz, you can use C<--noproxy
localhost,someserver.xyz>.
=item --verbose
Show verbose output while running. Includes all links checked, results
from the checks, etc.
=item --url <start URL>
Set the start URL. Checkbot starts checking at this URL, and then
recursively checks all links found on this page. The start URL takes
precedence over additional URLs specified on the command line.
If no scheme is specified for the URL, the file protocol is assumed.
=item --match <match string>
This option selects which pages Checkbot considers local. If the
I<match string> is contained within the URL, then Checkbot considers
the page local, retrieves it, and will check all the links contained
on it. Otherwise the page is considered external and it is only
checked with a HEAD request.
If no explicit I<match string> is given, the start URLs (See option
C<--url>) will be used as a match string instead. In this case the
last page name, if any, will be trimmed. For example, a start URL like
C<http://some.site/index.html> will result in a default I<match
string> of C<http://some.site/>.
The I<match string> can be a perl regular expression. For example, to
check the main server page and all HTML pages directly underneath it,
but not the HTML pages in the subdirectories of the server, the
I<match string> would be C<www.someserver.xyz/($|[^/]+.html)>.
=item --exclude <exclude string>
URLs matching the I<exclude string> are considered to be external,
even if they happen to match the I<match string> (See option
C<--match>). URLs matching the --exclude string are still being
checked and will be reported if problems are found, but they will not
be checked for further links into the site.
The I<exclude string> can be a perl regular expression. For example,
to consider all URLs with a query string external, use C<[=\?]>. This
can be useful when a URL with a query string unlocks the path to a
huge database which will be checked.
=item --filter <filter string>
This option defines a I<filter string>, which is a perl regular
expression. This filter is run on each URL found, thus rewriting the
URL before it enters the queue to be checked. It can be used to remove
elements from a URL. This option can be useful when symbolic links
point to the same directory, or when a content management system adds
session IDs to URLs.
For example C</old/new/> would replace occurrences of 'old' with 'new'
in each URL.
=item --ignore <ignore string>
URLs matching the I<ignore string> are not checked at all, they are
completely ignored by Checkbot. This can be useful to ignore known
problem links, or to ignore links leading into databases. The I<ignore
string> is matched after the I<filter string> has been applied.
The I<ignore string> can be a perl regular expression.
For example C<www.server.com\/(one|two)> would match all URLs starting
with either www.server.com/one or www.server.com/two.
=item --proxy <proxy URL>
This attribute specifies the URL of a proxy server. Only the HTTP and
FTP requests will be sent to that proxy server.
=item --internal-only
Skip the checking of external links at the end of the Checkbot
run. Only matching links are checked. Note that some redirections may
still cause external links to be checked.
=item --note <note>
The I<note> is included verbatim in the mail message (See option
C<--mailto>). This can be useful to include the URL of the summary HTML page
for easy reference, for instance.
Only meaningful in combination with the C<--mailto> option.
=item --sleep <seconds>
Number of I<seconds> to sleep in between requests. Default is 0
seconds, i.e. do not sleep at all between requests. Setting this
option can be useful to keep the load on the web server down while
running Checkbot. This option can also be set to a fractional number,
i.e. a value of 0.1 will sleep one tenth of a second between requests.
=item --timeout <timeout>
Default timeout for the requests, specified in seconds. The default is
2 minutes.
=item --interval <seconds>
The maximum interval between updates of the results web pages in
seconds. Default is 3 hours (10800 seconds). Checkbot will start the
interval at one minute, and gradually extend it towards the maximum
interval.
=item --style <URL of style file>
When this option is used, Checkbot embeds this URL as a link to a
style file on each page it writes. This makes it easy to customize the
layout of pages generated by Checkbot.
=item --dontwarn <HTTP response codes regular expression>
Do not include warnings on the result pages for those HTTP response
codes which match the regular expression. For instance, --dontwarn
"(301|404)" would not include 301 and 404 response codes.
Checkbot uses the response codes generated by the server, even if this
response code is not defined in RFC 2616 (HTTP/1.1). In addition to
the normal HTTP response code, Checkbot defines a few response codes
for situations which are not technically a problem, but which cause
problems in many cases anyway. These codes are:
901 Host name expected but not found
In this case the URL scheme supports a host name, but none was found
in the URL. This usually indicates a mistake in the URL. An
exception is that this check is not applied to news: URLs.
902 Unqualified host name found
In this case the host name does not contain the domain part.
This usually means that the pages work fine when viewed within
the original domain, but not when viewed from outside it.
903 Double slash in URL path
The URL has a double slash in it. This is legal, but some web
servers cannot handle it very well and may cause Checkbot to
run away. See also the comments below.
904 Unknown scheme in URL
The URL starts with a scheme that Checkbot does not know
about. This is often caused by mistyping the scheme of the URL,
but the scheme can also be a legal one. In that case please let
me know so that it can be added to Checkbot.
=item --enable-virtual
This option enables dealing with virtual servers. Checkbot then
assumes that all hostnames for internal servers are unique, even
though their IP addresses may be the same. Normally Checkbot uses the
IP address to distinguish servers. This has the advantage that if a
server has two names (e.g. www and bamboozle) its pages only get
checked once. When you want to check multiple virtual servers this
causes problems, which this feature works around by using the hostname
to distinguish the server.
=item --language
The argument for this option is a two-letter language code. Checkbot
will use language negotiation to request files in that language. The
default is to request English language (language code 'en').
=item --suppress
The argument for this option is a file which contains combinations of
error codes and URLs for which to suppress warnings. This can be used
to avoid reporting of known and unfixable URL errors or warnings.
The format of the suppression file is a simple whitespace delimited
format, first listing the error code followed by the URL. Each error
code and URL combination is listed on a new line. Comments can be
added to the file by starting the line with a C<#> character.
# 301 Moved Permanently
301 http://www.w3.org/P3P
# 403 Forbidden
403 http://www.herring.com/
For further flexibility a regular expression can be used instead of a
normal URL. The regular expression must be enclosed with forward
slashes. For example, to suppress all 403 errors on wikipedia:
403 /http:\/\/wikipedia.org\/.*/
=back
Deprecated options which will disappear in a future release:
=over
=item --allow-simple-hosts (deprecated)
This option turns off warnings about URLs which contain unqualified
host names. This is useful for intranet sites which often use just a
simple host name or even C<localhost> in their links.
Use of this option is deprecated. Please use the --dontwarn mechanism
for error 902 instead.
=back
=head1 HINTS AND TIPS
=over
=item Problems with checking FTP links
Some users may experience consistent problems with checking FTP
links. In these cases it may be useful to instruct Net::FTP to use
passive FTP mode to check files. This can be done by setting the
environment variable FTP_PASSIVE to 1. For example, using the bash
shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation
for more details.
=item Run-away Checkbot
In some cases Checkbot literally takes forever to finish. There are two
common causes for this problem.
First, there might be a database application as part of the web site
which generates a new page based on links on another page. Since
Checkbot tries to travel through all links this will create an
infinite number of pages. This kind of run-away effect is usually predictable. It can be avoided by using the --exclude option.
Second, a server configuration problem can cause a loop in generating
URLs for pages that really do not exist. This will result in URLs of
the form http://some.server/images/images/images/logo.png, with ever
more 'images' included. Checkbot cannot check for this because the
server should have indicated that the requested pages do not
exist. There is no easy way to solve this other than fixing the
offending web server or the broken links.
=item Problems with https:// links
The error message
Can't locate object method "new" via package "LWP::Protocol::https::Socket"
usually means that the current installation of LWP does not support
checking of SSL links (i.e. links starting with https://). This
problem can be solved by installing the Crypt::SSLeay module.
=back
=head1 EXAMPLES
The most simple use of Checkbot is to check a set of pages on a
server. To check my checkbot pages I would use:
checkbot http://degraaff.org/checkbot/
Checkbot runs can take some time so Checkbot can send a notification
mail when the run is done:
checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/
It is possible to check a set of local files without using a web
server. This only works for static files but may be useful in some
cases.
checkbot file:///var/www/documents/
=head1 PREREQUISITES
This script uses the C<LWP> modules.
=head1 COREQUISITES
This script can send mail when C<Mail::Send> is present.
=head1 AUTHOR
Hans de Graaff <hans@degraaff.org>
=pod OSNAMES
any
=cut
# Declare some global variables, avoids ugly use of main:: all around
# Human-readable descriptions for Checkbot's own pseudo HTTP response
# codes (the 900 series), used when reporting problems.
my %checkbot_errors = ('901' => 'Host name expected but not found',
'902' => 'Unqualified host name in URL',
'903' => 'URL contains double slash in URL',
'904' => 'Unknown scheme in URL',
);
# Start URLs for the run, filled by init_globals() from --url and @ARGV.
my @starturls = ();
# Two hashes to store the response to a URL, and all the parents of the URL
my %url_error = ();
my %url_parent = ();
# Hash for storing the title of a URL for use in reports. TODO: remove
# this and store title as part of queue.
my %url_title = ();
# Hash for suppressions, which are defined as a combination of code and URL
my %suppression = ();
# Hash to store statistics on link checking
# 'todo' counts queued-but-unchecked links, 'link' all links seen so
# far, 'problem' the number of problems found (gates send_mail below).
my %stats = ('todo' => 0,
'link' => 0,
'problem' => 0 );
# Options hash (to be filled by GetOptions)
my %options = ();
# Keep track of start time so that we can use it in reports
my $start_time = time();
# If on a Mac we should ask for the arguments through some MacPerl stuff
# (string eval so the script still compiles on platforms without MacPerl)
if ($^O eq 'MacOS') {
$main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')";
push(@ARGV, split(' ', $main::mac_answer));
}
# Prepare
check_options();
init_modules();
init_globals();
init_suppression();
# Start actual application
check_links();
# Finish up
create_page(1);
# Mail a summary only when --mailto was given and problems were found.
send_mail() if defined $main::opt_mailto and $stats{problem} > 0;
exit 0;
# Emit a progress/diagnostic line on STDERR, but only when --verbose is
# active. The optional second argument is an indentation level; each
# level adds one space of indentation. Any trailing newline on the
# message is normalized away and a single newline is re-added on output.
sub output {
    my ($message, $depth) = @_;

    return unless $main::opt_verbose;

    chomp $message;
    my $prefix = defined $depth ? ' ' x $depth : '';
    print STDERR $prefix, $message, "\n";
}
### Initialization and setup routines
# Parse command-line options into $main::opt_* package variables (via
# Getopt::Long's default behavior) and apply defaults. Prints help and
# exits when --help is given or when no start URL is available at all.
sub check_options {
# Get command-line arguments
use Getopt::Long;
my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s));
# Handle arguments, some are mandatory, some have defaults
# NOTE(review): '$main::opt_help && $main::opt_help' is redundant; it
# is equivalent to a single truth test of $main::opt_help.
&print_help if (($main::opt_help && $main::opt_help)
|| (!$main::opt_url && $#ARGV == -1));
# Default timeout is 120 seconds (2 minutes), per the documentation.
$main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout);
$main::opt_verbose = 0 unless $main::opt_verbose;
$main::opt_sleep = 0 unless defined($main::opt_sleep) && length($main::opt_sleep);
# Default report-update interval is 3 hours (10800 seconds).
$main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval;
# "xxx" can never match a real response code, so by default no
# warnings are suppressed by --dontwarn.
$main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn;
$main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual;
# Set the default language and make sure it is a two letter, lowercase code
$main::opt_language = 'en' unless defined $main::opt_language;
$main::opt_language = lc(substr($main::opt_language, 0, 2));
$main::opt_language =~ tr/a-z//cd;
if ($main::opt_language !~ /[a-z][a-z]/) {
warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n";
$main::opt_language = 'en';
}
$main::opt_allow_simple_hosts = 0
unless $main::opt_allow_simple_hosts;
output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts;
# The default for opt_match will be set later, because we might want
# to muck with opt_url first.
# Display messages about the options
output "*** Starting Checkbot $VERSION in verbose mode";
output 'Will skip checking of external links', 1
if $main::opt_internal_only;
output "Allowing unqualified host names", 1
if $main::opt_allow_simple_hosts;
output "Not using optional Time::Duration module: not found", 1
unless $main::useduration;
}
# Load the LWP machinery and configure the shared $main::ua user agent:
# identification string, timeout, optional proxy/no-proxy settings and
# an in-memory cookie jar. Mail::Send is loaded lazily, only when a
# notification mail was requested via --mailto.
sub init_modules {
use URI;
# Prepare the user agent to be used:
use LWP::UserAgent;
use LWP::MediaTypes;
#use LWP::Debug qw(- +debug);
use HTML::LinkExtor;
$main::ua = new LWP::UserAgent;
$main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version);
$main::ua->timeout($main::opt_timeout);
# Add a proxy to the user agent, if defined
$main::ua->proxy(['http', 'ftp'], $main::opt_proxy)
if defined($main::opt_proxy);
$main::ua->no_proxy(split(',', $main::opt_noproxy))
if defined $main::opt_noproxy;
# Add a cookie jar to the UA if requested by the user
# NOTE(review): 'defined X or X' is redundant -- a true value is always
# defined, so this condition is equivalent to the defined() check alone.
$main::ua->cookie_jar( {} )
if defined $main::opt_cookies or $main::opt_cookies;
require Mail::Send if defined $main::opt_mailto;
use HTTP::Status;
}
# Set up global state for the run: result file names, temporary queue
# locations, signal handlers, the list of start URLs (--url plus any
# command-line arguments), the default --match expression, and the
# initial statistics and checkpoint counters.
sub init_globals {
    my $url;

    # Directory and files for output
    if ($main::opt_file) {
        $main::file = $main::opt_file;
        # Server pages are named after the summary file without its
        # extension. BUG FIX: the original assigned $1 unconditionally,
        # leaving $main::server_prefix undefined when the file name
        # contains no dot; fall back to the full name in that case.
        if ($main::file =~ /(.*)\./) {
            $main::server_prefix = $1;
        } else {
            $main::server_prefix = $main::file;
        }
    } else {
        $main::file = "checkbot.html";
        $main::server_prefix = "checkbot";
    }

    # Temporary working directory, made unique with the process id.
    $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$";
    $main::cur_queue = $main::tmpdir . "/queue";
    $main::new_queue = $main::tmpdir . "/queue-new";

    # Make sure we catch signals so that we can clean up temporary files
    $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal;

    # Set up hashes to be used
    %main::checked = ();
    %main::servers = ();
    %main::servers_get_only = ();

    # Initialize the start URLs. --url takes precedence. Otherwise
    # just process URLs in order as they appear on the command line.
    unshift(@ARGV, $main::opt_url) if $main::opt_url;
    foreach (@ARGV) {
        $url = URI->new($_);

        # If no scheme is defined we will assume file is used, so that
        # it becomes easy to check a single file.
        $url->scheme('file') unless defined $url->scheme;
        $url->host('localhost') if $url->scheme eq 'file';

        if (!defined $url->host) {
            warn "No host specified in URL $url, ignoring it.\n";
            next;
        }

        push(@starturls, $url);
    }

    # BUG FIX: the original compared the count against -1, which a list
    # count can never be, so an empty start list was silently accepted.
    # Abort when no valid start URLs remain.
    die "There are no valid starting URLs to begin checking with!\n"
        if scalar(@starturls) == 0;

    # Set the automatic matching expression to a concatenation of the starturls
    if (!defined $main::opt_match) {
        my @matchurls;
        foreach my $url (@starturls) {
            # Remove trailing files from the match, e.g. remove index.html
            # stuff so that we match on the host and/or directory instead,
            # but only if there is a path component in the first place.
            my $matchurl = $url->as_string;
            $matchurl =~ s!/[^/]+$!/! unless $url->path eq '';
            push(@matchurls, quotemeta $matchurl);
        }
        $main::opt_match = '^(' . join('|', @matchurls) . ')';
        output "--match defaults to $main::opt_match";
    }

    # Initialize statistics hash with number of start URLs
    $stats{'todo'} = scalar(@starturls);

    # We write out our status every now and then.
    $main::cp_int = 1;
    $main::cp_last = 0;
}
# Read the --suppress file into %suppression. Each non-comment,
# non-blank line holds a numeric error code followed by whitespace and
# a URL (or /regex/); entries are stored as code -> URL -> URL so that
# later lookups by code and URL are cheap. Does nothing when --suppress
# was not given; dies if the argument is a directory or unreadable.
sub init_suppression {
    return if not defined $main::opt_suppress;

    die "Suppression file \"$main::opt_suppress\" is in fact a directory"
        if -d $main::opt_suppress;

    # Use a lexical filehandle and three-argument open: the original
    # two-argument bareword form misbehaves when the file name starts
    # with characters such as '>' or '|'.
    open(my $suppressions, '<', $main::opt_suppress)
        or die "Unable to open $main::opt_suppress for reading: $!\n";

    while (my $line = <$suppressions>) {
        chomp $line;
        next if $line =~ /^#/ or $line =~ /^\s*$/;

        if ($line !~ /^\s*(\d+)\s+(\S+)/) {
            output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n  $line\n";
        } else {
            output "Suppressed: $1 $2\n" if $main::opt_verbose;
            $suppression{$1}{$2} = $2;
        }
    }

    close $suppressions;
}
### Main application code
# Main checking loop. URLs travel through two queue files in the
# temporary directory: CURRENT holds the URLs for the pass in progress,
# while newly discovered URLs are appended to QUEUE (by add_to_queue,
# which shares the bareword QUEUE handle -- keep these handles global).
# When CURRENT is exhausted, QUEUE is promoted to CURRENT and the whole
# process repeats until a pass yields no new work. Queue lines have the
# form "url|parent-url".
sub check_links {
    my $line;

    # BUG FIX: the original read "mkdir $main::tmpdir, 0755 || die".
    # Without parentheses, || applied to 0755 (always true) rather than
    # to mkdir's return value, so the die could never trigger and mkdir
    # failures were silently ignored. Parenthesizing restores the check.
    mkdir($main::tmpdir, 0755)
        || die "$0: unable to create directory $main::tmpdir: $!\n";

    # Explicitly set the record separator. I had the problem that this
    # was not defined under my perl 5.00502. This should fix that, and
    # not cause problems for older versions of perl.
    $/ = "\n";

    open(CURRENT, ">$main::cur_queue")
        || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n";
    open(QUEUE, ">$main::new_queue")
        || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n";

    # Prepare CURRENT queue with starting URLs. Start URLs have no
    # parent, hence the empty field after the '|' separator.
    foreach (@starturls) {
        print CURRENT $_->as_string . "|\n";
    }
    close CURRENT;

    open(CURRENT, $main::cur_queue)
        || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n";

    do {
        # Read a line from the queue, and process it
        while (defined ($line = <CURRENT>) ) {
            chomp($line);
            &handle_url($line);
            # Periodically rewrite the result pages (see check_point).
            &check_point();
        }

        # Move queues around, and try again, but only if there are still
        # things to do
        output "*** Moving queues around, " . $stats{'todo'} . " links to do.";
        close CURRENT
            or warn "Error while closing CURRENT filehandle: $!\n";
        close QUEUE;

        # TODO: should check whether these succeed
        unlink($main::cur_queue);
        rename($main::new_queue, $main::cur_queue);

        open(CURRENT, "$main::cur_queue")
            || die "$0: Unable to open $main::cur_queue for reading: $!\n";
        open(QUEUE, ">$main::new_queue")
            || die "$0: Unable to open $main::new_queue for writing: $!\n";
    } while (not -z $main::cur_queue);

    close CURRENT;
    close QUEUE;

    # Keep the queue files around in debug mode for post-mortem inspection.
    unless (defined($main::opt_debug)) {
        clean_up();
    }
}
# Remove the temporary queue files and the directory that held them.
# Called on normal completion and from the signal handler, unless
# --debug asked for the files to be kept.
sub clean_up {
    unlink($main::cur_queue, $main::new_queue);
    rmdir($main::tmpdir);
    output("Removed temporary directory $main::tmpdir and its contents.\n", 1);
}
# Signal handler for INT/TERM/HUP/QUIT: tidy up the temporary files
# (unless --debug is active, in which case they are kept for
# inspection), report the signal on STDERR and exit non-zero.
sub got_signal {
    my ($signame) = @_;

    clean_up() unless defined $main::opt_debug;

    print STDERR "Caught SIG$signame.\n";
    exit 1;
}
# Whether a URL is 'internal' (part of the site being checked) or
# 'external'. A URL is internal when it matches the --match pattern and
# does not match the --exclude pattern (when one was given). Both
# patterns are compiled only once (/o), which is safe because they are
# fixed after startup.
sub is_internal ($) {
    my ($candidate) = @_;

    my $matched  = $candidate =~ /$main::opt_match/o;
    my $excluded = defined $main::opt_exclude
        && $candidate =~ /$main::opt_exclude/o;

    return ($matched && !$excluded);
}
# Process one queue line of the form "url|parent-url". Skips duplicates
# and (with --internal-only) external URLs, decides whether the URL
# needs a full GET (internal HTML documents, so their links can be
# extracted) or a cheap HEAD, performs the request, records any error,
# and queues the targets of internal redirects for later checking.
sub handle_url {
my ($line) = @_;
my ($urlstr, $urlparent) = split(/\|/, $line);
my $reqtype;
my $response;
my $type;
$stats{'todo'}--;
# Add this URL to the ones we've seen already, return if it is a
# duplicate.
return if add_checked($urlstr);
$stats{'link'}++;
# Is this an external URL and we only check internal stuff?
return if defined $main::opt_internal_only
and not is_internal($urlstr);
my $url = URI->new($urlstr);
# Perhaps this is a URL we are not interested in checking...
if (not defined($url->scheme)
or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) {
# Ignore URLs which we know we can ignore, create error for others
if ($url->scheme =~ /^(news|mailto|javascript|mms)$/o) {
output "Ignore $url", 1;
} else {
add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: "
. $url->scheme);
}
return;
}
# Guess/determine the type of document we might retrieve from this
# URL. We do this because we only want to use a full GET for HTML
# document. No need to retrieve images, etc.
# A trailing slash or empty path almost certainly means a directory
# index, so assume text/html; otherwise guess from the file extension.
if ($url->path =~ /\/$/o || $url->path eq "") {
$type = 'text/html';
} else {
$type = guess_media_type($url->path);
}
# application/octet-stream is the fallback of LWP's guess stuff, so
# if we get this then we ask the server what we got just to be sure.
if ($type eq 'application/octet-stream') {
$response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language);
$type = $response->content_type;
}
# Determine if this is a URL we should GET fully or partially (using HEAD)
if ($type =~ /html/o
&& $url->scheme =~ /^(https?|file|ftp|gopher)$/o
and is_internal($url->as_string)
&& (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) {
$reqtype = 'GET';
} else {
$reqtype = 'HEAD';
}
# Get the document, unless we already did while determining the type
$response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language)
unless defined($response) and $reqtype eq 'HEAD';
# Ok, we got something back from checking, let's see what it is
if ($response->is_success) {
# Be nice to the server: optional --sleep pause between requests
# (skipped for local files and in debug runs).
select(undef, undef, undef, $main::opt_sleep)
unless $main::opt_debug || $url->scheme eq 'file';
# Internal HTML documents need to be given to handle_doc for processing
if ($reqtype eq 'GET' and is_internal($url->as_string)) {
handle_doc($response, $urlstr);
}
} else {
# Right, so it wasn't the smashing success we hoped for, so bring
# the bad news and store the pertinent information for later
add_error($url, $urlparent, $response->code, $response->message);
# Follow redirects of internal URLs so the target gets checked too.
if ($response->is_redirect and is_internal($url->as_string)) {
if ($response->code == 300) { # multiple choices, but no redirection available
output 'Multiple choices', 2;
} else {
my $baseURI = URI->new($url);
if (defined $response->header('Location')) {
my $redir_url = URI->new_abs($response->header('Location'), $baseURI);
output "Redirected to $redir_url", 2;
add_to_queue($redir_url, $urlparent);
$stats{'todo'}++;
} else {
output 'Location header missing from redirect response', 2;
}
}
}
}
# Done with this URL
}
# Issue the actual request for a URL and return the HTTP::Response.
#   $reqtype   - 'GET' or 'HEAD': the request type the caller wants
#   $url       - URI object to request
#   $urlparent - URL of the referring page, sent as the Referer header
#   $type      - guessed media type; used for progress output only
#   $language  - optional Accept-Language header value ('' disables it)
#
# Strategy: normally try a cheap HEAD request first and fall back to a
# full GET when the server is known -- or discovered -- to mishandle
# HEAD. Such servers are remembered in %main::servers_get_only so later
# links skip the useless HEAD attempt.
#
# A better solution here would be to use GET exclusively, with
# $ua->max_size(1) set. However, that doesn't always work, as evidenced
# by an FTP server that just hangs in that case.
#
# Given the way LWP handles gopher requests, we need to do a GET on
# those to get at least a 500 and 501 error.
# -- Patch by Bruce Speyer <bspeyer@texas-one.org>
#
# We also need GET instead of HEAD if we know the remote server won't
# accept HEAD; the standard indications are 405 ("Method Not Allowed")
# and 501 ("Not Implemented"), with more heuristics documented below.
# -- Larry Gilbert <larry@n2h2.com>
sub performRequest {
    my ($reqtype, $url, $urlparent, $type, $language) = @_;

    my $response;

    foreach my $try ('HEAD', 'GET') {
        # Skip the HEAD attempt when we already know only a GET will
        # work: the caller asked for GET, the scheme is gopher, or this
        # server was previously flagged as GET-only.
        next if $try eq 'HEAD' and
            ($reqtype eq 'GET'
             or $url->scheme eq 'gopher'
             or (defined $url->authority and $main::servers_get_only{$url->authority}));

        # Report what we are going to do with this link
        output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1);

        # Create the request with all appropriate headers
        my %header_hash = ( 'Referer' => $urlparent );
        $header_hash{'Accept-Language'} = $language
            if defined($language) && $language ne '';
        my $ref_header = HTTP::Headers->new(%header_hash);
        my $request = HTTP::Request->new($try, $url, $ref_header);
        $response = $main::ua->simple_request($request);

        # After a HEAD request use heuristics to decide whether the
        # answer is trustworthy or we should retry with a GET.
        if ($try eq 'HEAD') {
            # 400, 405, 406 and 501 are standard indications that HEAD
            # shouldn't be used.
            # We used to check for 403 here also, but according to the
            # HTTP spec a 403 means the server understood us fine but
            # really does not want us to see the page, so we SHOULD NOT
            # retry.
            if ($response->code =~ /^(400|405|406|501)$/o) {
                output "Server does not seem to like HEAD requests; retrying", 2;
                # Guard: URLs such as file: have no authority part
                $main::servers_get_only{$url->authority}++
                    if defined $url->authority;
                next;
            }

            # There are many servers out there that have real trouble
            # with HEAD, so on a 500 Internal Server Error just retry
            # with a GET request to get an authoritive answer. We used
            # to do this only for special cases, but the list got big
            # and some combinations (e.g. Zope server behind Apache
            # proxy) can't easily be detected from the headers.
            if ($response->code =~ /^500$/o) {
                output "Internal server error on HEAD request; retrying with GET", 2;
                $main::servers_get_only{$url->authority}++
                    if defined $url->authority;
                next;
            }

            # If we know the server we can try some specific heuristics
            if (defined $response->server) {
                # Netscape Enterprise has been seen returning 500 and
                # even 404 (yes, 404!!) in response to HEAD requests
                if ($response->server =~ /^Netscape-Enterprise/o
                    and $response->code =~ /^404$/o) {
                    output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2;
                    $main::servers_get_only{$url->authority}++
                        if defined $url->authority;
                    next;
                }
            }

            # The HEAD request gave a trustworthy result; no need for a
            # further GET attempt.
            last;
        }
    }

    return $response;
}
# This routine creates a (temporary) WWW page based on the current
# findings. This allows somebody to monitor the process, but is also
# convenient when this program crashes or waits because of diskspace
# or memory problems.
# Write the results page based on the findings collected so far.
#   $final_page - true for the page written when the run has finished:
#                 no auto-refresh header, and the run duration is shown.
#
# The page is written to "$main::file.new" first and then renamed into
# place, so readers never see a partially written report.
sub create_page {
    my ($final_page) = @_;

    output "*** Start writing results page";

    # The package-level OUT handle is shared with printAllServers() and
    # print_server(), which also write to this page.
    open(OUT, '>', "$main::file.new")
        || die "$0: Unable to open $main::file.new for writing: $!\n";
    print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
    print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
    print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
    print OUT "<head>\n";
    # Interim pages refresh themselves at about half the checkpoint interval
    if (!$final_page) {
        printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
            int($main::cp_int * 60 / 2 - 5);
    }
    print OUT "<title>Checkbot report</title>\n";
    print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
    print OUT "</head>\n";
    print OUT "<body>\n";
    print OUT "<h1><em>Checkbot</em>: main report</h1>\n";

    # Show the status of this checkbot session
    print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>";
    if ($final_page) {
        print OUT "Done.<br />\n";
        print OUT 'Run started on ' . localtime($start_time) . ".<br />\n";
        print OUT 'Run duration ', duration(time() - $start_time), ".\n"
            if $main::useduration;
    } else {
        print OUT "Running since " . localtime($start_time) . ".<br />\n";
        print OUT "Last update at ". localtime() . ".<br />\n";
        print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n";
    }
    print OUT "</td></tr></table>\n\n";

    # Summary (very brief overview of key statistics)
    print OUT "<hr /><h2 class='summary'>Report summary</h2>\n";
    print OUT "<table summary=\"Report summary\" class='summary'>\n";
    print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n";
    print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n";
    print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n";
    print OUT "</table>\n";

    # Server information
    printAllServers($final_page);

    # Checkbot session parameters
    print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n";
    print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n";
    print OUT "<tr><th align=\"left\">--url &<br/> <command line urls></th><td class='text'>Start URL(s)</td><td class='value' id='url'>",
        join(',', @starturls), "</td></tr>\n";
    print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n";
    print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude;
    print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter;
    print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy;
    print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore;
    print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress;
    print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx';
    print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual;
    print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only;
    print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies;
    print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" if ($main::opt_sleep != 0);
    print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n";
    print OUT "</table>\n";

    # Statistics for types of links
    print OUT signature();

    close(OUT);

    # Move the new page into place; keep the previous page as a backup
    # only in debug runs.
    rename($main::file, $main::file . ".bak");
    rename($main::file . ".new", $main::file);
    unlink $main::file . ".bak" unless $main::opt_debug;

    output "*** Done writing result page";
}
# Create a list of all the servers, and create the corresponding table
# and subpages. We use the servers overview for this. This can result
# in strange effects when the same server (e.g. IP address) has
# several names, because several entries will appear. However, when
# using the IP address there are also a number of tricky situations,
# e.g. with virtual hosting. Given that likely the servers have
# different names for a reasons, I think it is better to have
# duplicate entries in some cases, instead of working off of the IP
# addresses.
# Emit the per-server overview table on the main report page (written
# to the already-open OUT handle), one row per known server via
# print_server().
#   $finalPage - passed through to print_server(); true on final report
sub printAllServers {
    my ($finalPage) = @_;

    print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n";
    print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n";
    for my $server (sort keys %main::servers) {
        print_server($server, $finalPage);
    }
    print OUT "</table>\n\n";
}
# Determine (and cache in %main::server_type) the server software for a
# host, by asking the server itself with a HEAD request on its root URL.
# 'localhost' is special-cased as direct filesystem access. Returns the
# server type string, never undef.
sub get_server_type {
    my ($server) = @_;

    if (!defined $main::server_type{$server}) {
        my $result;
        if ($server eq 'localhost') {
            $result = 'Direct access through filesystem';
        } else {
            my $request = HTTP::Request->new('HEAD', "http://$server/");
            my $response = $main::ua->simple_request($request);
            $result = $response->header('Server');
        }
        # Missing or empty Server header: record a generic answer
        $result = "Unknown server type" if !defined $result or $result eq "";
        output "=== Server $server is a $result";
        $main::server_type{$server} = $result;
    }
    return $main::server_type{$server};
}
# Mark a URL as checked in %main::checked.
# Returns 1 if the URL had already been checked before this call,
# 0 if this is the first time it is seen.
sub add_checked {
    my ($urlstr) = @_;
    my $result = 0;

    if (is_internal($urlstr) and not $main::opt_enable_virtual) {
        # Substitute hostname with IP-address. This keeps us from
        # checking the same pages for each name of the server, wasting
        # time & resources. Only do this if we are not dealing with
        # virtual servers (where aliases serve different content), and
        # only for internal servers, because it makes no sense for
        # external links.
        my $url = URI->new($urlstr);
        $url->host(ip_address($url->host)) if $url->can('host');
        $urlstr = $url->as_string;
    }

    if (defined $main::checked{$urlstr}) {
        $result = 1;
        $main::checked{$urlstr}++;
    } else {
        $main::checked{$urlstr} = 1;
    }

    return $result;
}
# Has this URL already been checked?
# Report whether a URL has already been checked (i.e. has an entry in
# %main::checked), without modifying any state.
sub is_checked {
    my ($urlstr) = @_;

    if (is_internal($urlstr) and not $main::opt_enable_virtual) {
        # Canonicalize internal URLs to their IP address, exactly as
        # add_checked() does, so lookups hit the same keys. Hostnames
        # are kept as-is for virtual servers; external links are never
        # rewritten because it makes no sense for them.
        my $uri = URI->new($urlstr);
        $uri->host(ip_address($uri->host)) if $uri->can('host');
        $urlstr = $uri->as_string;
    }

    return defined $main::checked{$urlstr};
}
# Record a problem for a URL, unless it is suppressed.
#   $url       - the URL the problem was found on
#   $urlparent - the page on which the URL appeared
#   $code      - HTTP (or Checkbot 9xx) status code
#   $status    - status text; looked up from $code when undef
# Updates %url_error, %url_parent and the problem counter.
sub add_error ($$$$) {
    my ($url, $urlparent, $code, $status) = @_;

    # Check for the quick eliminations first: codes the user asked not
    # to warn about, and exact URL entries in the suppression file.
    # NOTE(review): the {$url} lookup autovivifies $suppression{$code},
    # so the "defined" test below is always true afterwards; kept as-is
    # since the inner loop is a no-op for an empty hash.
    return if $code =~ /$main::opt_dontwarn/o
        or defined $suppression{$code}{$url};

    # Check for matches on the regular expressions in the suppression
    # file; these entries are written as /regexp/. Iterate over the
    # keys only -- the values carry no pattern information.
    if (defined $suppression{$code}) {
        foreach my $item (keys %{ $suppression{$code} }) {
            if ($item =~ /^\/(.*)\/$/) {
                my $regexp = $1;
                if ($url =~ $regexp) {
                    output "Supressing error $code for $url due to regular expression match on $regexp", 2;
                    return;
                }
            }
        }
    }

    $status = checkbot_status_message($code) if not defined $status;
    output "$code $status", 2;

    $url_error{$url}{'code'} = $code;
    $url_error{$url}{'status'} = $status;
    push @{$url_parent{$url}}, $urlparent;
    $stats{'problem'}++;
}
# Parse document, and get the links
sub handle_doc {
# Parse a retrieved HTML document: record its title and hosting server,
# honor a robots "nofollow" meta tag, extract all links, flag obviously
# malformed URLs, and queue links that have not been checked yet.
#   $response - HTTP::Response holding the document just fetched
#   $urlstr   - URL string under which the document was retrieved;
#               problem reports are attached to this URL
my ($response, $urlstr) = @_;
my $num_links = 0;
my $new_links = 0;
# TODO: we are making an assumption here that the $response->base is
# valid, which might not always be true! This needs to be fixed, but
# first let's try to find out why this stuff is sometimes not
# valid... Aha. a simple <base href="news:"> will do the trick. It is
# not clear what the right fix for this is.
# We use the URL we used to retrieve this document as the URL to
# attach the problem reports to, even though this may not be the
# proper base url.
my $baseurl = URI->new($urlstr);
# When we received the document we can add a notch to its server
$main::servers{$baseurl->authority}++;
# Retrieve useful information from this document.
# TODO: using a regexp is NOT how this should be done, but it is
# easy. The right way would be to write a HTML::Parser or to use
# XPath on the document DOM provided that the document is easily
# parsed as XML. Either method is a lot of overhead.
if ($response->content =~ /title\>(.*?)\<\/title/si) {
# TODO: using a general hash that stores titles for all pages may
# consume too much memory. It would be better to only store the
# titles for requests that had problems. That requires passing them
# down to the queue. Take the easy way out for now.
$url_title{$baseurl} = $1;
}
# Check if this document has a Robots META tag. If so, check if
# Checkbot is allowed to FOLLOW the links on this page. Note that we
# ignore the INDEX directive because Checkbot is not an indexing
# robot. See http://www.robotstxt.org/wc/meta-user.html
# TODO: one more reason (see title) to properly parse this document...
if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) {
output "Obeying robots meta tag $&, skipping document", 2;
return;
}
# Parse the document just downloaded, using the base url as defined
# in the response, otherwise we won't get the same behavior as
# browsers and miss things like a BASE url in pages.
my $p = HTML::LinkExtor->new(undef, $response->base);
# If charset information is missing then decoded_content doesn't
# work. Fall back to content in this case, even though that may lead
# to charset warnings. See bug 1665075 for reference.
my $content = $response->decoded_content || $response->content;
$p->parse($content);
$p->eof;
# Deal with the links we found in this document.
# Each element of @links is [tag, attr => url, attr => url, ...].
my @links = $p->links();
foreach (@links) {
my ($tag, %l) = @{$_};
# Loop over the link-carrying attributes of this tag; $_ is the
# attribute name, $l{$_} the URI object it points to.
foreach (keys %l) {
# Get the canonical URL, so we don't need to worry about base, case, etc.
my $url = $l{$_}->canonical;
# Remove fragments, if any
$url->fragment(undef);
# Determine in which tag this URL was found
# Ignore <base> tags because they need not point to a valid URL
# in order to work (e.g. when directory indexing is turned off).
next if $tag eq 'base';
# Skip some 'links' that are not required to link to an actual
# live link but which LinkExtor returns as links anyway.
next if $tag eq 'applet' and $_ eq 'code';
next if $tag eq 'object' and $_ eq 'classid';
# Run filter on the URL if defined. The --filter argument is a
# full s/// replacement expression, applied via string eval.
if (defined $main::opt_filter) {
die "Filter supplied with --filter option contains errors!\n$@\n"
unless defined eval '$url =~ s' . $main::opt_filter
}
# Should we ignore this URL?
if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) {
output "--ignore: $url", 1;
next;
}
# Check whether URL has fully-qualified hostname
if ($url->can('host') and $url->scheme ne 'news') {
if (! defined $url->host) {
add_error($url, $baseurl->as_string, '901',
$checkbot_errors{'901'});
} elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) {
add_error($url, $baseurl->as_string, '902',
$checkbot_errors{'902'});
}
}
# Some servers do not process // correctly in requests for relative
# URLs. We should flag them here. Note that // in a URL path is
# actually valid per RFC 2396, and that they should not be removed
# when processing relative URLs as per RFC 1808. See
# e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>.
# Thanks to Randal Schwartz and Reinier Post for their explanations.
if ($url =~ /^http:\/\/.*\/\//) {
add_error($url, $baseurl->as_string, '903',
$checkbot_errors{'903'});
}
# We add all URLs found to the queue, unless we already checked
# it earlier
if (is_checked($url)) {
# If an error has already been logged for this URL we add the
# current parent to the list of parents on which this URL
# appears.
if (defined $url_error{$url}) {
push @{$url_parent{$url}}, $baseurl->as_string;
$stats{'problem'}++;
}
$stats{'link'}++;
} else {
add_to_queue($url, $baseurl);
$stats{'todo'}++;
$new_links++;
}
$num_links++;
}
}
output "Got $num_links links ($new_links new) from document", 2;
}
# Append a URL and its parent page to the on-disk work queue (the
# already-open QUEUE handle), one pipe-separated pair per line.
sub add_to_queue {
    my ($queued_url, $queued_parent) = @_;
    print QUEUE "$queued_url|$queued_parent\n";
}
# Return a human-readable status text for a code: first try the
# standard HTTP status text, then Checkbot's own 9xx error table, and
# finally a generic fallback.
sub checkbot_status_message ($) {
    my ($code) = @_;
    return status_message($code) || $checkbot_errors{$code}
        || '(Undefined status)';
}
sub print_server ($$) {
# Write one row of the server overview table (to the OUT handle) and
# create the per-server problem report page for $server.
#   $server     - server name, possibly including a ":port" suffix
#   $final_page - true on the final report; disables the refresh header
#                 on the generated server page
my($server, $final_page) = @_;
my $host = $server;
# Strip a trailing :port so ip_address() gets a bare hostname
$host =~ s/(.*):\d+/$1/;
output "Writing server $server (really " . ip_address($host) . ")", 1;
my $server_problem = count_problems($server);
my $filename = "$main::server_prefix-$server.html";
# ':' (from host:port) is not filename-safe; replace with '-'
$filename =~ s/:/-/o;
# Overview row: only link to the server page when it has problems
print OUT "<tr><td class='server'>";
print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0;
print OUT "$server";
print OUT "</a>" if $server_problem > 0;
print OUT "</td>";
print OUT "<td class='servertype'>" . get_server_type($server) . "</td>";
printf OUT "<td class='unique' align=\"right\">%d</td>",
$main::servers{$server} + $server_problem;
if ($server_problem) {
printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>",
$server_problem;
} else {
printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>",
$server_problem;
}
# Problem ratio in percent; rendered bold from 0.5% upwards
my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100;
print OUT "<td class='ratio' align=\"right\">";
print OUT "<strong>" unless $ratio < 0.5;
printf OUT "%4d%%", $ratio;
print OUT "</strong>" unless $ratio < 0.5;
print OUT "</td>";
print OUT "</tr>\n";
# Create this server's own report page; printServerProblems() below
# writes the problem details to the SERVER handle.
open(SERVER, ">$filename")
|| die "Unable to open server file $filename for writing: $!";
print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
print SERVER "<head>\n";
# Interim pages refresh themselves at about half the checkpoint interval
if (!$final_page) {
printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
int($main::cp_int * 60 / 2 - 5);
}
print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
print SERVER "<title>Checkbot: output for server $server</title></head>\n";
print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n";
print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>";
printServerProblems($server, $final_page);
print SERVER "\n";
print SERVER signature();
close SERVER;
}
# Return a string containing Checkbot's signature for HTML pages
# Return Checkbot's HTML page footer: the signature paragraph with a
# timestamp, the XHTML-validation badge, and the closing body/html tags.
sub signature {
    my $stamp = localtime();
    return "<hr />\n"
        . "<p class='signature'>Page created by <a href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>"
        . $stamp
        . "</em>.</p>\n"
        . "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>"
        . "</body></html>";
}
# Loop through all possible problems, select relevant ones for this server
# and display them in a meaningful way.
sub printServerProblems ($$) {
# Write (to the SERVER handle) all problems whose parent page lives on
# $server: first a table-of-contents line listing the error codes
# found, then, per code, a definition list of parent pages with the
# broken links found on each.
#   $server     - server name; matched as a literal substring against
#                 parent URLs
#   $final_page - accepted for symmetry with the other print routines;
#                 not used here
my ($server, $final_page) = @_;
# Quote metacharacters so e.g. "host:8080" matches literally below
$server = quotemeta $server;
my $separator = "<hr />\n";
my %thisServerList = ();
# First we find all the problems for this particular server,
# re-keyed as code -> parent -> url -> status text.
foreach my $url (keys %url_parent) {
foreach my $parent (@{$url_parent{$url}}) {
next if $parent !~ $server;
chomp $parent;
$thisServerList{$url_error{$url}{'code'}}{$parent}{$url}
= $url_error{$url}{'status'};
}
}
# Do a run to find all error codes on this page, and include a table
# of contents to the actual report
foreach my $code (sort keys %thisServerList) {
print SERVER ", <a href=\"#rc$code\">$code ";
print SERVER checkbot_status_message($code);
print SERVER "</a>";
}
print SERVER ".</p>\n";
# Now run through this list and print the errors
foreach my $code (sort keys %thisServerList) {
my $codeOut = '';
foreach my $parent (sort keys %{ $thisServerList{$code} }) {
my $urlOut = '';
foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) {
my $status = $thisServerList{$code}{$parent}{$url};
$urlOut .= "<li><a href=\"$url\">$url</a><br/>\n";
# Only repeat the status text when it differs from the
# generic message for this code
$urlOut .= "$status"
if defined $status and $status ne checkbot_status_message($code);
$urlOut .= "</li>\n";
}
if ($urlOut ne '') {
$codeOut .= "<dt><a href=\"$parent\">$parent</a>";
$codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent};
$codeOut .= "<dd><ul>\n$urlOut\n</ul>\n\n";
}
}
# Emit the section for this code; the separator is printed only
# once, before the first section
if ($codeOut ne '') {
print SERVER $separator if $separator;
$separator = '';
print SERVER "<h4 id=\"rc$code\">$code ";
print SERVER checkbot_status_message($code);
print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n";
}
}
}
# Write an intermediate results page when enough time has passed since
# the last snapshot (or on every call in verbose debug runs), and back
# off the snapshot interval over time.
sub check_point {
    if (($main::cp_last + 60 * $main::cp_int < time())
        || ($main::opt_debug && $main::opt_verbose)) {
        create_page(0);
        $main::cp_last = time();
        # Increase the interval from one snapshot to the next by 25%
        # until we have reached the maximum.
        $main::cp_int *= 1.25 unless $main::opt_debug;
        $main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval;
    }
}
# Mail a brief synopsis of the run (link/problem counts, start time,
# duration) to the address given with --mailto.
sub send_mail {
    my $msg = Mail::Send->new;
    my $sub = 'Checkbot results for ';
    $sub .= join(', ', @starturls);
    $sub .= ': ' . $stats{'problem'} . ' errors';

    $msg->to($main::opt_mailto);
    $msg->subject($sub);

    my $fh = $msg->open;
    print $fh "Checkbot results for:\n " . join("\n ", @starturls) . "\n\n";
    print $fh "User-supplied note: $main::opt_note\n\n"
        if defined $main::opt_note;
    print $fh $stats{'link'}, " links were checked, and ";
    print $fh $stats{'problem'}, " problems were detected.\n";
    print $fh 'Run started on ' . localtime($start_time) . "\n";
    print $fh 'Run duration ', duration(time() - $start_time), "\n"
        if $main::useduration;
    print $fh "\n-- \nCheckbot $VERSION\n";
    print $fh "<URL:http://degraaff.org/checkbot/>\n";
    $fh->close;
}
# Print Checkbot's command line usage summary to STDOUT and terminate
# the program with exit status 0. This routine never returns.
sub print_help {
print <<"__EOT__";
Checkbot $VERSION command line options:

  --cookies          Accept cookies from the server
  --debug            Debugging mode: No pauses, stop after 25 links.
  --file file        Use file as basis for output file names.
  --help             Provide this message.
  --mailto address   Mail brief synopsis to address when done.
  --noproxy domains  Do not proxy requests to given domains.
  --verbose          Verbose mode: display many messages about progress.
  --url url          Start URL
  --match match      Check pages only if URL matches `match'
                     If no match is given, the start URL is used as a match
  --exclude exclude  Exclude pages if the URL matches 'exclude'
  --filter regexp    Run regexp on each URL found
  --ignore ignore    Ignore URLs matching 'ignore'
  --suppress file    Use contents of 'file' to suppress errors in output
  --note note        Include Note (e.g. URL to report) along with Mail message.
  --proxy URL        URL of proxy server for HTTP and FTP requests.
  --internal-only    Only check internal links, skip checking external links.
  --sleep seconds    Sleep this many seconds between requests (default 0)
  --style url        Reference the style sheet at this URL.
  --timeout seconds  Timeout for http requests in seconds (default 120)
  --interval seconds Maximum time interval between updates (default 10800)
  --dontwarn codes   Do not write warnings for these HTTP response codes
  --enable-virtual   Use only virtual names, not IP numbers for servers
  --language         Specify 2-letter language code for language negotiation

Options --match, --exclude, and --ignore can take a perl regular expression
as their argument\n
Use 'perldoc checkbot' for more verbose documentation.

Checkbot WWW page     : http://degraaff.org/checkbot/
Mail bugs and problems: checkbot\@degraaff.org
__EOT__

exit 0;
}
# Resolve a hostname to its dotted-quad IPv4 address, caching the
# result in %main::ip_cache. If the host does not resolve, the hostname
# itself is cached and returned, so any real error surfaces later when
# the URL is actually requested.
sub ip_address {
    my ($host) = @_;

    # Serve repeated lookups from the cache
    return $main::ip_cache{$host} if defined $main::ip_cache{$host};

    # Only the address list (5th and later return values) is needed
    my (undef, undef, undef, undef, @addrs) = gethostbyname($host);
    if (defined $addrs[0]) {
        # Format the first packed address as dotted quad
        my ($o1, $o2, $o3, $o4) = unpack('C4', $addrs[0]);
        $main::ip_cache{$host} = "$o1.$o2.$o3.$o4";
    } else {
        # Whee! No IP-address found for this host. Just keep whatever
        # we got for the host. If this really is some kind of error it
        # will be found later on.
        $main::ip_cache{$host} = $host;
    }

    return $main::ip_cache{$host};
}
# Count the problem links reported against the given server: each
# (url, parent) pair whose parent URL mentions the server counts once.
sub count_problems {
    my ($server) = @_;
    my $pattern = quotemeta $server;
    my $total = 0;

    for my $url (sort keys %url_parent) {
        # grep in scalar context yields the number of matching parents
        $total += grep { $_ =~ m/$pattern/ } @{ $url_parent{$url} };
    }

    return $total;
}
|