This file is indexed.

/usr/bin/checkbot is in checkbot 1.80-3.

This file is owned by root:root, with mode 0o755.

The actual contents of the file can be viewed below.

#!/usr/bin/perl -w

eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
    if 0; # not running under some shell
#
# checkbot - A perl5 script to check validity of links in www document trees
#
# Hans de Graaff <hans@degraaff.org>, 1994-2005.
# Based on Dimitri Tischenko, Delft University of Technology, 1994
# Based on the testlinks script by Roy Fielding
# With contributions from Bruce Speyer <bruce.speyer@elecomm.com>
#
# This application is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# Info-URL: http://degraaff.org/checkbot/
#
# $Id: checkbot 238 2008-10-15 12:55:00Z graaff $
# (Log information can be found at the end of the script)

require 5.004;
use strict;

require LWP;
use File::Basename;

BEGIN {
  eval "use Time::Duration qw(duration)";
  $main::useduration = ($@ ? 0 : 1);
}

# Version information
my
$VERSION = '1.80';


=head1 NAME

Checkbot - WWW Link Verifier

=head1 SYNOPSIS

checkbot [B<--cookies>] [B<--debug>] [B<--file> file name] [B<--help>]
         [B<--mailto> email addresses] [B<--noproxy> list of domains]
         [B<--verbose>]
         [B<--url> start URL]
         [B<--match> match string] [B<--exclude> exclude string]
         [B<--proxy> proxy URL] [B<--internal-only>]
         [B<--ignore> ignore string]
         [B<--filter> substitution regular expression]
         [B<--style> style file URL]
         [B<--note> note] [B<--sleep> seconds] [B<--timeout> timeout]
         [B<--interval> seconds] [B<--dontwarn> HTTP response codes]
         [B<--enable-virtual>]
         [B<--language> language code]
         [B<--suppress> suppression file]
         [start URLs]

=head1 DESCRIPTION

Checkbot verifies the links in a specific portion of the World Wide
Web. It creates HTML pages with diagnostics.

Checkbot uses LWP to find URLs on pages and to check them. It supports
the same schemes as LWP does, and finds the same links that
HTML::LinkExtor will find.

Checkbot considers links to be either 'internal' or
'external'. Internal links are links within the web space that needs
to be checked. If an internal link points to a web document, this
document is retrieved and its links are extracted and
processed. External links are only checked to see whether they
work. Checkbot checks links as it finds them, so internal and
external links are checked at the same time, even though they are
treated differently.

Options for Checkbot are:

=over 4

=item --cookies

Accept cookies from the server and offer them again in later
requests. This may be useful for servers that use cookies to handle
sessions. By default Checkbot does not accept any cookies.

=item --debug

Enable debugging mode. Not really supported anymore, but it will keep
some files around that otherwise would be deleted.

=item --file <file name>

Use the file I<file name> as the basis for the summary file names. The
summary page will get the I<file name> given, and the server pages are
based on the I<file name> without the .html extension. For example,
setting this option to C<index.html> will create a summary page called
index.html and server pages called index-server1.html and
index-server2.html.

The default value for this option is C<checkbot.html>.

=item --help

Shows a brief help message on standard output.

=item --mailto <email address>[,<email address>]

Send mail to the I<email address> when Checkbot is done checking. You
can give more than one address separated by commas. The notification
email includes a small summary of the results. As of Checkbot 1.76
email is only sent if problems have been found during the Checkbot
run.

=item --noproxy <list of domains>

Do not proxy requests to the given domains. The list of domains must
be a comma-separated list. For example, to avoid using the proxy for
localhost and someserver.xyz, you can use C<--noproxy
localhost,someserver.xyz>.

=item --verbose

Show verbose output while running. Includes all links checked, results
from the checks, etc.

=item --url <start URL>

Set the start URL. Checkbot starts checking at this URL, and then
recursively checks all links found on this page. The start URL takes
precedence over additional URLs specified on the command line.

If no scheme is specified for the URL, the file protocol is assumed.
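
For example, assuming a local documentation tree under /var/www/docs,
these two invocations check the same file:

    checkbot /var/www/docs/index.html
    checkbot file:///var/www/docs/index.html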

=item --match <match string>

This option selects which pages Checkbot considers local. If the
I<match string> is contained within the URL, then Checkbot considers
the page local, retrieves it, and will check all the links contained
on it. Otherwise the page is considered external and it is only
checked with a HEAD request.

If no explicit I<match string> is given, the start URLs (See option
C<--url>) will be used as a match string instead. In this case the
last page name, if any, will be trimmed. For example, a start URL like
C<http://some.site/index.html> will result in a default I<match
string> of C<http://some.site/>.

The I<match string> can be a perl regular expression.  For example, to
check the main server page and all HTML pages directly underneath it,
but not the HTML pages in the subdirectories of the server, the
I<match string> would be C<www.someserver.xyz/($|[^/]+.html)>.

=item --exclude <exclude string>

URLs matching the I<exclude string> are considered to be external,
even if they happen to match the I<match string> (See option
C<--match>). URLs matching the I<exclude string> are still checked
and will be reported if problems are found, but they will not be
scanned for further links into the site.

The I<exclude string> can be a perl regular expression. For example,
to consider all URLs with a query string external, use C<[=\?]>. This
can be useful when a URL with a query string opens the way into a
huge database which Checkbot would otherwise try to check.

=item --filter <filter string>

This option defines a I<filter string>, which is a perl regular
expression. This filter is run on each URL found, thus rewriting the
URL before it enters the queue to be checked. It can be used to remove
elements from a URL. This option can be useful when symbolic links
point to the same directory, or when a content management system adds
session IDs to URLs.

For example C</old/new/> would replace occurrences of 'old' with 'new'
in each URL.
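
As another illustration (the parameter name and URL are only
examples), a filter that strips a session ID from each URL could look
like:

    checkbot --filter '/[&;]?PHPSESSID=[^&]*//' http://www.example.com/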

=item --ignore <ignore string>

URLs matching the I<ignore string> are not checked at all, they are
completely ignored by Checkbot. This can be useful to ignore known
problem links, or to ignore links leading into databases. The I<ignore
string> is matched after the I<filter string> has been applied.

The I<ignore string> can be a perl regular expression.

For example C<www.server.com\/(one|two)> would match all URLs starting
with either www.server.com/one or www.server.com/two.


=item --proxy <proxy URL>

This option specifies the URL of a proxy server. Only HTTP and
FTP requests will be sent to that proxy server.

=item --internal-only

Skip the checking of external links at the end of the Checkbot
run. Only matching links are checked. Note that some redirections may
still cause external links to be checked.

=item --note <note>

The I<note> is included verbatim in the mail message (See option
C<--mailto>). This can be useful to include the URL of the summary HTML page
for easy reference, for instance.

Only meaningful in combination with the C<--mailto> option.

=item --sleep <seconds>

Number of I<seconds> to sleep in between requests. Default is 0
seconds, i.e. do not sleep at all between requests. Setting this
option can be useful to keep the load on the web server down while
running Checkbot. This option can also be set to a fractional number,
i.e. a value of 0.1 will sleep one tenth of a second between requests.

=item --timeout <timeout>

Default timeout for the requests, specified in seconds. The default is
2 minutes.

=item --interval <seconds>

The maximum interval between updates of the results web pages in
seconds. Default is 3 hours (10800 seconds). Checkbot will start the
interval at one minute, and gradually extend it towards the maximum
interval.

=item --style <URL of style file>

When this option is used, Checkbot embeds this URL as a link to a
style file on each page it writes. This makes it easy to customize the
layout of pages generated by Checkbot.

=item --dontwarn <HTTP response codes regular expression>

Do not include warnings on the result pages for those HTTP response
codes which match the regular expression. For instance, --dontwarn
"(301|404)" would not include 301 and 404 response codes.

Checkbot uses the response codes generated by the server, even if this
response code is not defined in RFC 2616 (HTTP/1.1). In addition to
the normal HTTP response code, Checkbot defines a few response codes
for situations which are not technically a problem, but which cause
problems in many cases anyway. These codes are:

  901 Host name expected but not found
      In this case the URL supports a host name, but none was found
      in the URL. This usually indicates a mistake in the URL. An
      exception is that this check is not applied to news: URLs.

  902 Unqualified host name found
      In this case the host name does not contain the domain part.
      This usually means that the pages work fine when viewed within
      the original domain, but not when viewed from outside it.

  903 Double slash in URL path
      The URL has a double slash in it. This is legal, but some web
      servers cannot handle it very well and may cause Checkbot to
      run away. See also the comments below.

  904 Unknown scheme in URL
      The URL starts with a scheme that Checkbot does not know
      about. This is often caused by mistyping the scheme of the URL,
      but the scheme can also be a legal one. In that case please let
      me know so that it can be added to Checkbot.

=item --enable-virtual

This option enables dealing with virtual servers. Checkbot then
assumes that all hostnames for internal servers are unique, even
though their IP addresses may be the same. Normally Checkbot uses the
IP address to distinguish servers. This has the advantage that if a
server has two names (e.g. www and bamboozle) its pages only get
checked once. When you want to check multiple virtual servers this
causes problems, which this feature works around by using the hostname
to distinguish the server.

=item --language

The argument for this option is a two-letter language code. Checkbot
will use language negotiation to request files in that language. The
default is to request English language (language code 'en').
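
For example, to request German-language pages where the server offers
them (the URL here is only illustrative):

    checkbot --language de http://www.example.com/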

=item --suppress

The argument for this option is a file which contains combinations of
error codes and URLs for which to suppress warnings. This can be used
to avoid reporting of known and unfixable URL errors or warnings.

The format of the suppression file is a simple whitespace delimited
format, first listing the error code followed by the URL. Each error
code and URL combination is listed on a new line. Comments can be
added to the file by starting the line with a C<#> character.

  # 301 Moved Permanently
  301   http://www.w3.org/P3P
  
  # 403 Forbidden
  403   http://www.herring.com/

For further flexibility a regular expression can be used instead of a
normal URL. The regular expression must be enclosed with forward
slashes. For example, to suppress all 403 errors on Wikipedia:

  403   /http:\/\/wikipedia.org\/.*/

=back

Deprecated options which will disappear in a future release:

=over

=item --allow-simple-hosts (deprecated)

This option turns off warnings about URLs which contain unqualified
host names. This is useful for intranet sites which often use just a
simple host name or even C<localhost> in their links.

Use of this option is deprecated. Please use the --dontwarn mechanism
for error 902 instead.
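
For instance, a rough equivalent for an intranet check would be (the
host name is only illustrative):

    checkbot --dontwarn 902 http://intranet/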

=back


=head1 HINTS AND TIPS

=over

=item Problems with checking FTP links

Some users may experience consistent problems with checking FTP
links. In these cases it may be useful to instruct Net::FTP to use
passive FTP mode to check files. This can be done by setting the
environment variable FTP_PASSIVE to 1. For example, using the bash
shell: C<FTP_PASSIVE=1 checkbot ...>. See the Net::FTP documentation
for more details.

=item Run-away Checkbot

In some cases Checkbot appears to run forever. There are two
common causes for this problem.

First, there might be a database application as part of the web site
which generates a new page based on links on another page. Since
Checkbot tries to follow all links, this will create an infinite
number of pages. This kind of run-away effect is usually predictable,
and it can be avoided by using the C<--exclude> option.

Second, a server configuration problem can cause a loop in generating
URLs for pages that really do not exist. This will result in URLs of
the form http://some.server/images/images/images/logo.png, with ever
more 'images' included. Checkbot cannot check for this because the
server should have indicated that the requested pages do not
exist. There is no easy way to solve this other than fixing the
offending web server or the broken links.

=item Problems with https:// links

The error message

  Can't locate object method "new" via package "LWP::Protocol::https::Socket"

usually means that the current installation of LWP does not support
checking of SSL links (i.e. links starting with https://). This
problem can be solved by installing the Crypt::SSLeay module.
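
For example, one common way to install the module, assuming the
C<cpan> client that ships with Perl is available:

    cpan Crypt::SSLeay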

=back

=head1 EXAMPLES

The simplest use of Checkbot is to check a set of pages on a
server. To check my checkbot pages I would use:

    checkbot http://degraaff.org/checkbot/

Checkbot runs can take some time so Checkbot can send a notification
mail when the run is done:

    checkbot --mailto hans@degraaff.org http://degraaff.org/checkbot/

It is possible to check a set of local files without using a web
server. This only works for static files but may be useful in some
cases.

    checkbot file:///var/www/documents/
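
Options can be combined. For instance, to check a site while treating
query URLs as external and mailing a report when done (all names here
are only illustrative):

    checkbot --mailto webmaster@example.com --exclude '[=\?]' http://www.example.com/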

=head1 PREREQUISITES

This script uses the C<LWP> modules.

=head1 COREQUISITES

This script can send mail when C<Mail::Send> is present.

=head1 AUTHOR

Hans de Graaff <hans@degraaff.org>

=pod OSNAMES

any

=cut

# Declare some global variables, avoids ugly use of main:: all around
my %checkbot_errors = ('901' => 'Host name expected but not found',
		       '902' => 'Unqualified host name in URL',
		       '903' => 'Double slash in URL path',
		       '904' => 'Unknown scheme in URL',
		      );

my @starturls = ();

# Two hashes to store the response to a URL, and all the parents of the URL
my %url_error = ();
my %url_parent = ();

# Hash for storing the title of a URL for use in reports. TODO: remove
# this and store title as part of queue.
my %url_title = ();

# Hash for suppressions, which are defined as a combination of code and URL
my %suppression = ();

# Hash to store statistics on link checking
my %stats = ('todo' => 0,
	     'link' => 0,
	     'problem' => 0 );

# Options hash (to be filled by GetOptions)
my %options = ();

# Keep track of start time so that we can use it in reports
my $start_time = time();

# If on a Mac we should ask for the arguments through some MacPerl stuff
if ($^O eq 'MacOS') {
  $main::mac_answer = eval "MacPerl::Ask('Enter Command-Line Options')";
  push(@ARGV, split(' ', $main::mac_answer));
}

# Prepare
check_options();
init_modules();
init_globals();
init_suppression();

# Start actual application
check_links();

# Finish up
create_page(1);
send_mail() if defined $main::opt_mailto and $stats{problem} > 0;

exit 0;

# output prints stuff on stderr if --verbose, and takes care of proper
# indentation
sub output {
  my ($line, $level) = @_;

  return unless $main::opt_verbose;

  chomp $line;

  my $indent = '';

  if (defined $level) {
    while ($level-- > 0) {
      $indent .= '    ';
    }
  }

  print STDERR $indent, $line, "\n";
}

### Initialization and setup routines

sub check_options {

  # Get command-line arguments
  use Getopt::Long;
  my $result = GetOptions(qw(cookies debug help noproxy=s verbose url=s match=s exclude|x=s file=s filter=s style=s ignore|z=s mailto|M=s note|N=s proxy=s internal-only sleep=f timeout=i interval=i dontwarn=s enable-virtual language=s allow-simple-hosts suppress=s));

  # Handle arguments, some are mandatory, some have defaults
  &print_help if ($main::opt_help
                  || (!$main::opt_url && $#ARGV == -1));
  $main::opt_timeout = 120 unless defined($main::opt_timeout) && length($main::opt_timeout);
  $main::opt_verbose = 0 unless $main::opt_verbose;
  $main::opt_sleep = 0 unless defined($main::opt_sleep) && length($main::opt_sleep);
  $main::opt_interval = 10800 unless defined $main::opt_interval and length $main::opt_interval;
  $main::opt_dontwarn = "xxx" unless defined $main::opt_dontwarn and length $main::opt_dontwarn;
  $main::opt_enable_virtual = 0 unless defined $main::opt_enable_virtual;
  # Set the default language and make sure it is a two letter, lowercase code
  $main::opt_language = 'en' unless defined $main::opt_language;
  $main::opt_language = lc(substr($main::opt_language, 0, 2));
  $main::opt_language =~ tr/a-z//cd;
  if ($main::opt_language !~ /[a-z][a-z]/) {
    warn "Argument --language $main::opt_language is not a valid language code\nUsing English as a default.\n";
    $main::opt_language = 'en';
  }
  $main::opt_allow_simple_hosts = 0
	  unless $main::opt_allow_simple_hosts;
  output "--allow-simple-hosts is deprecated, please use the --dontwarn mechanism", 0 if $main::opt_allow_simple_hosts;

  # The default for opt_match will be set later, because we might want
  # to muck with opt_url first.

  # Display messages about the options
  output "*** Starting Checkbot $VERSION in verbose mode";
  output 'Will skip checking of external links', 1
    if $main::opt_internal_only;
  output "Allowing unqualified host names", 1
    if $main::opt_allow_simple_hosts;
  output "Not using optional Time::Duration module: not found", 1
	unless $main::useduration;
}

sub init_modules {

  use URI;
  # Prepare the user agent to be used:
  use LWP::UserAgent;
  use LWP::MediaTypes;
  #use LWP::Debug qw(- +debug);
  use HTML::LinkExtor;
  $main::ua = new LWP::UserAgent;
  $main::ua->agent("Checkbot/$VERSION LWP/" . LWP::Version);
  $main::ua->timeout($main::opt_timeout);
  # Add a proxy to the user agent, if defined
  $main::ua->proxy(['http', 'ftp'], $main::opt_proxy)
    if defined($main::opt_proxy);
  $main::ua->no_proxy(split(',', $main::opt_noproxy))
    if defined $main::opt_noproxy;
  # Add a cookie jar to the UA if requested by the user
  $main::ua->cookie_jar( {} )
    if defined $main::opt_cookies or $main::opt_cookies;

  require Mail::Send if defined $main::opt_mailto;

  use HTTP::Status;
}

sub init_globals {
  my $url;

  # Directory and files for output
  if ($main::opt_file) {
    $main::file = $main::opt_file;
    $main::file =~ /(.*)\./;
    $main::server_prefix = $1;
  } else {
    $main::file = "checkbot.html";
    $main::server_prefix = "checkbot";
  }
  $main::tmpdir = ($ENV{'TMPDIR'} or $ENV{'TMP'} or $ENV{'TEMP'} or "/tmp") . "/Checkbot.$$";

  $main::cur_queue  = $main::tmpdir . "/queue";
  $main::new_queue  = $main::tmpdir . "/queue-new";

  # Make sure we catch signals so that we can clean up temporary files
  $SIG{'INT'} = $SIG{'TERM'} = $SIG{'HUP'} = $SIG{'QUIT'} = \&got_signal;

  # Set up hashes to be used
  %main::checked = ();
  %main::servers = ();
  %main::servers_get_only = ();

  # Initialize the start URLs. --url takes precedence. Otherwise
  # just process URLs in order as they appear on the command line.
  unshift(@ARGV, $main::opt_url) if $main::opt_url;
  foreach (@ARGV) {
    $url = URI->new($_);
    # If no scheme is defined we will assume file is used, so that
    # it becomes easy to check a single file.
    $url->scheme('file') unless defined $url->scheme;
    $url->host('localhost') if $url->scheme eq 'file';
    if (!defined $url->host) {
      warn "No host specified in URL $url, ignoring it.\n";
      next;
    }
    push(@starturls, $url);
  }
  die "There are no valid starting URLs to begin checking with!\n"
    if scalar(@starturls) == 0;

  # Set the automatic matching expression to a concatenation of the starturls
  if (!defined $main::opt_match) {
    my @matchurls;
    foreach my $url (@starturls) {
      # Remove trailing files from the match, e.g. remove index.html
      # stuff so that we match on the host and/or directory instead,
      # but only if there is a path component in the first place.
      my $matchurl = $url->as_string;
      $matchurl =~ s!/[^/]+$!/! unless $url->path eq '';
      push(@matchurls, quotemeta $matchurl);
    }
    $main::opt_match = '^(' . join('|', @matchurls) . ')';
    output "--match defaults to $main::opt_match";
  }

  # Initialize statistics hash with number of start URLs
  $stats{'todo'} = scalar(@starturls);

  # We write out our status every now and then.
  $main::cp_int = 1;
  $main::cp_last = 0;
}

sub init_suppression {
  return if not defined $main::opt_suppress;

  die "Suppression file \"$main::opt_suppress\" is in fact a directory"
	if -d $main::opt_suppress;

  open(SUPPRESSIONS, $main::opt_suppress)
    or die "Unable to open $main::opt_suppress for reading: $!\n";
  while (my $line = <SUPPRESSIONS>) {
    chomp $line;
    next if $line =~ /^#/ or $line =~ /^\s*$/;

    if ($line !~ /^\s*(\d+)\s+(\S+)/) {
      output "WARNING: Unable to parse line in suppression file $main::opt_suppress:\n    $line\n";
    } else {
      output "Suppressed: $1 $2\n" if $main::opt_verbose;
      $suppression{$1}{$2} = $2;
    }
  }
  close SUPPRESSIONS;
}




### Main application code

sub check_links {
  my $line;

  mkdir $main::tmpdir, 0755
    or die "$0: unable to create directory $main::tmpdir: $!\n";

  # Explicitly set the record separator. I had the problem that this
  # was not defined under my perl 5.00502. This should fix that, and
  # not cause problems for older versions of perl.
  $/ = "\n";

  open(CURRENT, ">$main::cur_queue")
    || die "$0: Unable to open CURRENT $main::cur_queue for writing: $!\n";
  open(QUEUE, ">$main::new_queue")
    || die "$0: Unable to open QUEUE $main::new_queue for writing: $!\n";

  # Prepare CURRENT queue with starting URLs
  foreach (@starturls) {
    print CURRENT $_->as_string . "|\n";
  }
  close CURRENT;

  open(CURRENT, $main::cur_queue)
    || die "$0: Unable to open CURRENT $main::cur_queue for reading: $!\n";

  do {
    # Read a line from the queue, and process it
    while (defined ($line = <CURRENT>) ) {
      chomp($line);
      &handle_url($line);
      &check_point();
    }

    # Move queues around, and try again, but only if there are still
    # things to do
    output "*** Moving queues around, " . $stats{'todo'} . " links to do.";
    close CURRENT
      or warn "Error while closing CURRENT filehandle: $!\n";
    close QUEUE;

    # TODO: should check whether these succeed
    unlink($main::cur_queue);
    rename($main::new_queue, $main::cur_queue);

    open(CURRENT, "$main::cur_queue") 
      || die "$0: Unable to open $main::cur_queue for reading: $!\n";
    open(QUEUE, ">$main::new_queue") 
      || die "$0: Unable to open $main::new_queue for writing: $!\n";

  } while (not -z $main::cur_queue);

  close CURRENT;
  close QUEUE;

  unless (defined($main::opt_debug)) {
    clean_up();
  }
}

sub clean_up {
  unlink $main::cur_queue, $main::new_queue;
  rmdir $main::tmpdir;
  output "Removed temporary directory $main::tmpdir and its contents.\n", 1;
}

sub got_signal {
  my ($signalname) = @_;

  clean_up() unless defined $main::opt_debug;

  print STDERR "Caught SIG$signalname.\n";
  exit 1;
}

# Whether URL is 'internal' or 'external'
sub is_internal ($) {
  my ($url) = @_;

  return ( $url =~ /$main::opt_match/o
	   and not (defined $main::opt_exclude and $url =~ /$main::opt_exclude/o));
}


sub handle_url {
  my ($line) = @_;
  my ($urlstr, $urlparent) = split(/\|/, $line);

  my $reqtype;
  my $response;
  my $type;

  $stats{'todo'}--;

  # Add this URL to the ones we've seen already, return if it is a
  # duplicate.
  return if add_checked($urlstr);

  $stats{'link'}++;

  # Is this an external URL and we only check internal stuff?
  return if defined $main::opt_internal_only
    and not is_internal($urlstr);

  my $url = URI->new($urlstr);

  # Perhaps this is a URL we are not interested in checking...
  if (not defined($url->scheme) 
      or $url->scheme !~ /^(https?|file|ftp|gopher|nntp)$/o ) {
    # Ignore URLs which we know we can ignore, create error for others
    if (defined($url->scheme) and $url->scheme =~ /^(news|mailto|javascript|mms)$/o) {
      output "Ignore $url", 1;
    } else {
      add_error($urlstr, $urlparent, 904, "Unknown scheme in URL: "
				. $url->scheme);
    }
    return;
  }

  # Guess/determine the type of document we might retrieve from this
  # URL. We do this because we only want to use a full GET for HTML
  # document. No need to retrieve images, etc.
  if ($url->path =~ /\/$/o || $url->path eq "") {
    $type = 'text/html';
  } else {
    $type = guess_media_type($url->path);
  }
  # application/octet-stream is the fallback of LWP's guess stuff, so
  # if we get this then we ask the server what we got just to be sure.
  if ($type eq 'application/octet-stream') {
    $response = performRequest('HEAD', $url, $urlparent, $type, $main::opt_language);
    $type = $response->content_type;
  }

  # Determine if this is a URL we should GET fully or partially (using HEAD)
  if ($type =~ /html/o
      && $url->scheme =~ /^(https?|file|ftp|gopher)$/o
      and is_internal($url->as_string)
      && (!defined $main::opt_exclude || $url !~ /$main::opt_exclude/o)) {
    $reqtype = 'GET';
  } else {
    $reqtype = 'HEAD';
  }

  # Get the document, unless we already did while determining the type
  $response = performRequest($reqtype, $url, $urlparent, $type, $main::opt_language)
    unless defined($response) and $reqtype eq 'HEAD';

  # Ok, we got something back from checking, let's see what it is
  if ($response->is_success) {
    select(undef, undef, undef, $main::opt_sleep)
      unless $main::opt_debug || $url->scheme eq 'file';

    # Internal HTML documents need to be given to handle_doc for processing
	if ($reqtype eq 'GET' and is_internal($url->as_string)) {
	  handle_doc($response, $urlstr);
	}
  } else {

    # Right, so it wasn't the smashing success we hoped for, so bring
    # the bad news and store the pertinent information for later
    add_error($url, $urlparent, $response->code, $response->message);

    if ($response->is_redirect and is_internal($url->as_string)) {
      if ($response->code == 300) {  # multiple choices, but no redirection available
	output 'Multiple choices', 2;
      } else {
	my $baseURI = URI->new($url);
	if (defined $response->header('Location')) {
	  my $redir_url = URI->new_abs($response->header('Location'), $baseURI);
	  output "Redirected to $redir_url", 2;
	  add_to_queue($redir_url, $urlparent);
	  $stats{'todo'}++;
	} else {
	  output 'Location header missing from redirect response', 2;
	}
      }
    }
  }
  # Done with this URL
}

sub performRequest {
  my ($reqtype, $url, $urlparent, $type, $language) = @_;

  my ($response);

  # A better solution here would be to use GET exclusively. Here is how
  # to do that. We would have to set this max_size thing in
  # check_external, I guess...
  # Set $ua->max_size(1) and then try a normal GET request. However,
  # that doesn't always work as evidenced by an FTP server that just
  # hangs in this case... Needs more testing to see if the timeout
  # catches this.

  # Normally, we would only need to do a HEAD, but given the way LWP
  # handles gopher requests, we need to do a GET on those to get at
  # least a 500 and 501 error. We would need to parse the document
  # returned by LWP to find out if we had problems finding the
  # file. -- Patch by Bruce Speyer <bspeyer@texas-one.org>

  # We also need to do GET instead of HEAD if we know the remote
  # server won't accept it.  The standard way for an HTTP server to
  # indicate this is by returning a 405 ("Method Not Allowed") or 501
  # ("Not Implemented").  Other circumstances may also require sending
  # GETs instead of HEADs to a server.  Details are documented below.
  # -- Larry Gilbert <larry@n2h2.com>

  # Normally we try a HEAD request first, then a GET request if
  # needed. There may be circumstances in which we skip doing a HEAD
  # (e.g. when we should be getting the whole document).
  foreach my $try ('HEAD', 'GET') {

    # Skip trying HEAD when we know we need to do a GET or when we
    # know only a GET will work anyway.
    next if $try eq 'HEAD' and
      ($reqtype eq 'GET'
       or $url->scheme eq 'gopher'
       or (defined $url->authority and $main::servers_get_only{$url->authority}));

    # Output what we are going to do with this link
    output(sprintf("%4s %s (%s)\n", $try, $url, $type), 1);

    # Create the request with all appropriate headers
    my %header_hash = ( 'Referer' => $urlparent );
    if (defined($language) && ($language ne '')) {
      $header_hash{'Accept-Language'} = $language;
    }
    my $ref_header = new HTTP::Headers(%header_hash);
    my $request = new HTTP::Request($try, $url, $ref_header);
    $response = $main::ua->simple_request($request);

    # If we are doing a HEAD request we need to make sure nothing
    # fishy happened. We use some heuristics to see if we are ok, or
    # if we should try again with a GET request.
    if ($try eq 'HEAD') {

      # 400, 405, 406 and 501 are standard indications that HEAD
      # shouldn't be used
	  # We used to check for 403 here also, but according to the HTTP spec
      # a 403 indicates that the server understood us fine but really does
	  # not want us to see the page, so we SHOULD NOT retry.
      if ($response->code =~ /^(400|405|406|501)$/o) {
		output "Server does not seem to like HEAD requests; retrying", 2;
		$main::servers_get_only{$url->authority}++;
		next;
      };

	  # There are many servers out there that have real trouble with
	  # HEAD, so if we get a 500 Internal Server error just retry with
	  # a GET request to get an authoritative answer. We used to do this
	  # only for special cases, but the list got big and some
	  # combinations (e.g. Zope server behind Apache proxy) can't
	  # easily be detected from the headers.
	  if ($response->code =~ /^500$/o) {
		output "Internal server error on HEAD request; retrying with GET", 2;
		$main::servers_get_only{$url->authority}++ if defined $url->authority;
		next;
	  }

      # If we know the server we can try some specific heuristics
      if (defined $response->server) {

		# Netscape Enterprise has been seen returning 500 and even 404
		# (yes, 404!!) in response to HEAD requests
		if ($response->server =~ /^Netscape-Enterprise/o
			and $response->code =~ /^404$/o) {
		  output "Unreliable Netscape-Enterprise response to HEAD request; retrying", 2;
		  $main::servers_get_only{$url->authority}++;
		  next;
		};
	  }

      # If a HEAD request resulted in nothing noteworthy, no need for
      # any further attempts using GET, we are done.
      last;
    }
  }

  return $response;
}


# This routine creates a (temporary) WWW page based on the current
# findings. This allows somebody to monitor the process, but is also
# convenient when this program crashes or waits because of disk space
# or memory problems

sub create_page {
    my($final_page) = @_;

    my $path = "";
    my $prevpath = "";
    my $prevcode = 0;
    my $prevmessage = "";

    output "*** Start writing results page";

    open(OUT, ">$main::file.new") 
	|| die "$0: Unable to open $main::file.new for writing: $!\n";
    print OUT "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
    print OUT "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
    print OUT "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
    print OUT "<head>\n";
    if (!$final_page) {
      printf OUT "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
      int($main::cp_int * 60 / 2 - 5);
    }

    print OUT "<title>Checkbot report</title>\n";
    print OUT "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
    print OUT "</head>\n";
    print OUT "<body>\n";
    print OUT "<h1><em>Checkbot</em>: main report</h1>\n";

    # Show the status of this checkbot session
    print OUT "<table summary=\"Status of this Checkbot session\" class='status'><tr><th>Status:</th><td>";
    if ($final_page) {
      print OUT "Done.<br />\n";
      print OUT 'Run started on ' . localtime($start_time) . ".<br />\n";
      print OUT 'Run duration ', duration(time() - $start_time), ".\n"
	if $main::useduration;
    } else {
      print OUT "Running since " . localtime($start_time) . ".<br />\n";
      print OUT "Last update at ". localtime() . ".<br />\n";
      print OUT "Next update in <strong>", int($main::cp_int), "</strong> minutes.\n";
    }
    print OUT "</td></tr></table>\n\n";

    # Summary (very brief overview of key statistics)
    print OUT "<hr /><h2 class='summary'>Report summary</h2>\n";

    print OUT "<table summary=\"Report summary\" class='summary'>\n";
    print OUT "<tr id='checked'><th>Links checked</th><td class='value'>", $stats{'link'}, "</td></tr>\n";
    print OUT "<tr id='problems'><th>Problems so far</th><td class='value'>", $stats{'problem'}, "</td></tr>\n";
    print OUT "<tr id='todo'><th>Links to do</th><td class='value'>", $stats{'todo'}, "</td></tr>\n";
    print OUT "</table>\n";

    # Server information
    printAllServers($final_page);

    # Checkbot session parameters
    print OUT "<hr /><h2 class='params'>Checkbot session parameters</h2>\n";
    print OUT "<table summary=\"Checkbot session parameters\" class='params'>\n";
    print OUT "<tr><th align=\"left\">--url &amp;<br/> &lt;command line urls&gt;</th><td class='text'>Start URL(s)</td><td class='value' id='url'>",
              join(',', @starturls), "</td></tr>\n";
    print OUT "<tr><th align=\"left\">--match</th><td class='text'>Match regular expression</td><td class='value' id='match'>$main::opt_match</td></tr>\n";
    print OUT "<tr><th align=\"left\">--exclude</th><td class='text'>Exclude regular expression</td><td class='value' id='exclude'>$main::opt_exclude</td></tr>\n" if defined $main::opt_exclude;
    print OUT "<tr><th align=\"left\">--filter</th><td class='text'>Filter regular expression</td><td class='value' id='filter'>$main::opt_filter</td></tr>\n" if defined $main::opt_filter;
    print OUT "<tr><th align=\"left\">--noproxy</th><td class='text'>No Proxy for the following domains</td><td class='value' id='noproxy'>$main::opt_noproxy</td></tr>\n" if defined $main::opt_noproxy;
    print OUT "<tr><th align=\"left\">--ignore</th><td class='text'>Ignore regular expression</td><td class='value' id='ignore'>$main::opt_ignore</td></tr>\n" if defined $main::opt_ignore;
    print OUT "<tr><th align=\"left\">--suppress</th><td class='text'>Suppress error code and URL specified by</td><td class='value' id='suppress'>$main::opt_suppress</td></tr>\n" if defined $main::opt_suppress;
    print OUT "<tr><th align=\"left\">--dontwarn</th><td class='text'>Don't warn for these codes</td><td class='value' id='dontwarn'>$main::opt_dontwarn</td></tr>\n" if $main::opt_dontwarn ne 'xxx';
    print OUT "<tr><th align=\"left\">--enable-virtual</th><td class='text'>Use virtual names only</td><td class='value' id='enable_virtual'>yes</td></tr>\n" if $main::opt_enable_virtual;
    print OUT "<tr><th align=\"left\">--internal-only</th><td class='text'>Check only internal links</td><td class='value' id='internal_only'>yes</td></tr>\n" if defined $main::opt_internal_only;
    print OUT "<tr><th align=\"left\">--cookies</th><td class='text'>Accept cookies</td><td class='value' id='cookies'>yes</td></tr>\n" if defined $main::opt_cookies;
    print OUT "<tr><th align=\"left\">--sleep</th><td class='text'>Sleep seconds between requests</td><td class='value' id='sleep'>$main::opt_sleep</td></tr>\n" if ($main::opt_sleep != 0);
    print OUT "<tr><th align=\"left\">--timeout</th><td class='text'>Request timeout seconds</td><td class='value' id='timeout'>$main::opt_timeout</td></tr>\n";
    print OUT "</table>\n";

    # Statistics for types of links

    print OUT signature();

    close(OUT);

    rename($main::file, $main::file . ".bak");
    rename($main::file . ".new", $main::file);

    unlink $main::file . ".bak" unless $main::opt_debug;

    output "*** Done writing result page";
}

# Create a list of all the servers, and create the corresponding table
# and subpages. We use the servers overview for this. This can result
# in strange effects when the same server (e.g. IP address) has
# several names, because several entries will appear. However, when
# using the IP address there are also a number of tricky situations,
# e.g. with virtual hosting. Given that the servers likely have
# different names for a reason, I think it is better to have
# duplicate entries in some cases, instead of working off of the IP
# addresses.

sub printAllServers {
  my ($finalPage) = @_;

  my $server;
  print OUT "<hr /><h2 class='overview'>Overview per server</h2>\n";
  print OUT "<table summary=\"Overview per server\" class='overview'><tr><th>Server</th><th>Server<br />Type</th><th>Documents<br />scanned</th><th>Problem<br />links</th><th>Ratio</th></tr>\n";

  foreach $server (sort keys %main::servers) {
    print_server($server, $finalPage);
  }
  print OUT "</table>\n\n";
}

sub get_server_type {
  my($server) = @_;

  my $result;

  if ( ! defined($main::server_type{$server})) {
    if ($server eq 'localhost') {
      $result = 'Direct access through filesystem';
    } else {
      my $request = new HTTP::Request('HEAD', "http://$server/");
      my $response = $main::ua->simple_request($request);
      $result = $response->header('Server');
    }
    $result = "Unknown server type" if ! defined $result or $result eq "";
    output "=== Server $server is a $result";
    $main::server_type{$server} = $result;
  }
  $main::server_type{$server};
}

sub add_checked {
  my($urlstr) = @_;
  my $item;
  my $result = 0;

  if (is_internal($urlstr) and not $main::opt_enable_virtual) {
    # Substitute hostname with IP-address. This keeps us from checking
    # the same pages for each name of the server, wasting time & resources.
    # Only do this if we are not dealing with virtual servers. Also, we
    # only do this for internal servers, because it makes no sense for
    # external links.
    my $url = URI->new($urlstr);
    $url->host(ip_address($url->host)) if $url->can('host');
    $urlstr = $url->as_string;
  }

  if (defined $main::checked{$urlstr}) {
    $result = 1;
    $main::checked{$urlstr}++;
  } else {
    $main::checked{$urlstr} = 1;
  }

  return $result;
}

# Has this URL already been checked?
sub is_checked {
  my ($urlstr) = @_;

  if (is_internal($urlstr) and not $main::opt_enable_virtual) {
    # Substitute hostname with IP-address. This keeps us from checking
    # the same pages for each name of the server, wasting time & resources.
    # Only do this if we are not dealing with virtual servers. Also, we
    # only do this for internal servers, because it makes no sense for
    # external links.
    my $url = URI->new($urlstr);
    $url->host(ip_address($url->host)) if $url->can('host');
    $urlstr = $url->as_string;
  }

  return defined $main::checked{$urlstr};
}

sub add_error ($$$$) {
  my ($url, $urlparent, $code, $status) = @_;

  # Check for the quick eliminations first
  return if $code =~ /$main::opt_dontwarn/o
    or defined $suppression{$code}{$url};

  # Check for matches on the regular expressions in the suppression file
  if (defined $suppression{$code}) {
	foreach my $item ( keys %{$suppression{$code}} ) {
	  if ($item =~ /^\/(.*)\/$/) {
		my $regexp = $1;
		if ($url =~ $regexp) {
		  output "Suppressing error $code for $url due to regular expression match on $regexp", 2;
		  return;
		}
	  }
	}
  }

  $status = checkbot_status_message($code) if not defined $status;

  output "$code $status", 2;

  $url_error{$url}{'code'} = $code;
  $url_error{$url}{'status'} = $status;
  push @{$url_parent{$url}}, $urlparent;
  $stats{'problem'}++;
}

# Parse document, and get the links
sub handle_doc {
  my ($response, $urlstr) = @_;

  my $num_links = 0;
  my $new_links = 0;

  # TODO: we are making an assumption here that the $response->base is
  # valid, which might not always be true! This needs to be fixed, but
  # first let's try to find out why this stuff is sometimes not
  # valid... Aha. a simple <base href="news:"> will do the trick. It is
  # not clear what the right fix for this is.

  # We use the URL we used to retrieve this document as the URL to
  # attach the problem reports to, even though this may not be the
  # proper base url.
  my $baseurl = URI->new($urlstr);

  # When we received the document we can add a notch to its server
  $main::servers{$baseurl->authority}++;

  # Retrieve useful information from this document.
  # TODO: using a regexp is NOT how this should be done, but it is
  # easy. The right way would be to write a HTML::Parser or to use
  # XPath on the document DOM provided that the document is easily
  # parsed as XML. Either method is a lot of overhead.
  if ($response->content =~ /title\>(.*?)\<\/title/si) {

	# TODO: using a general hash that stores titles for all pages may
	# consume too much memory. It would be better to only store the
	# titles for requests that had problems. That requires passing them
	# down to the queue. Take the easy way out for now.
	$url_title{$baseurl} = $1;
  }

  # Check if this document has a Robots META tag. If so, check if
  # Checkbot is allowed to FOLLOW the links on this page. Note that we
  # ignore the INDEX directive because Checkbot is not an indexing
  # robot. See http://www.robotstxt.org/wc/meta-user.html
  # TODO: one more reason (see title) to properly parse this document...
  if ($response->content =~ /\<meta[^\>]*?robots[^\>]*?nofollow[^\>]*?\>/si) {
	output "Obeying robots meta tag $&, skipping document", 2;
	return;
  }


  # Parse the document just downloaded, using the base url as defined
  # in the response, otherwise we won't get the same behavior as
  # browsers and miss things like a BASE url in pages.
  my $p = HTML::LinkExtor->new(undef, $response->base);

  # If charset information is missing then decoded_content doesn't
  # work. Fall back to content in this case, even though that may lead
  # to charset warnings. See bug 1665075 for reference.
  my $content = $response->decoded_content || $response->content;
  $p->parse($content);
  $p->eof;

  # Deal with the links we found in this document
  my @links = $p->links();
  foreach (@links) {
    my ($tag, %l) = @{$_};
    foreach (keys %l) {
      # Get the canonical URL, so we don't need to worry about base, case, etc.
      my $url = $l{$_}->canonical;

      # Remove fragments, if any
      $url->fragment(undef);

      # Determine in which tag this URL was found
      # Ignore <base> tags because they need not point to a valid URL
      # in order to work (e.g. when directory indexing is turned off).
      next if $tag eq 'base';

	  # Skip some 'links' that are not required to link to an actual
	  # live link but which LinkExtor returns as links anyway.
	  next if $tag eq 'applet' and $_ eq 'code';
	  next if $tag eq 'object' and $_ eq 'classid';

      # Run filter on the URL if defined
      if (defined $main::opt_filter) {
	die "Filter supplied with --filter option contains errors!\n$@\n"
	  unless defined eval '$url =~ s' . $main::opt_filter
      }

      # Should we ignore this URL?
      if (defined $main::opt_ignore and $url =~ /$main::opt_ignore/o) {
	output "--ignore: $url", 1;
	next;
      }

      # Check whether URL has fully-qualified hostname
      if ($url->can('host') and $url->scheme ne 'news') {
        if (! defined $url->host) {
		  add_error($url, $baseurl->as_string, '901',
					$checkbot_errors{'901'});
        } elsif (!$main::opt_allow_simple_hosts && $url->host !~ /\./) {
		  add_error($url, $baseurl->as_string, '902',
					$checkbot_errors{'902'});
        }
      }

      # Some servers do not process // correctly in requests for relative
      # URLs. We should flag them here. Note that // in a URL path is
      # actually valid per RFC 2396, and that they should not be removed
      # when processing relative URLs as per RFC 1808. See
      # e.g. <http://deesse.univ-lemans.fr:8003/Connected/RFC/1808/18.html>.
      # Thanks to Randal Schwartz and Reinier Post for their explanations.
      if ($url =~ /^http:\/\/.*\/\//) {
		add_error($url, $baseurl->as_string, '903',
				  $checkbot_errors{'903'});
      }

      # We add all URLs found to the queue, unless we already checked
      # it earlier
      if (is_checked($url)) {

		# If an error has already been logged for this URL we add the
		# current parent to the list of parents on which this URL
		# appears.
		if (defined $url_error{$url}) {
		  push @{$url_parent{$url}}, $baseurl->as_string;
		  $stats{'problem'}++;
		}
	
		$stats{'link'}++;
      } else {
		add_to_queue($url, $baseurl);
		$stats{'todo'}++;
		$new_links++;
      }
      $num_links++;
    }
  }
  output "Got $num_links links ($new_links new) from document", 2;
}


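# Append a URL and the parent page it was found on to the new queue
# file, to be processed in the next round.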
sub add_to_queue {
  my ($url, $parent) = @_;

  print QUEUE $url . '|' . $parent . "\n";
}

sub checkbot_status_message ($) {
  my ($code) = @_;

  my $result = status_message($code) || $checkbot_errors{$code}
    || '(Undefined status)';
}

sub print_server ($$) {
  my($server, $final_page) = @_;

  my $host = $server;
  $host =~ s/(.*):\d+/$1/;

  output "Writing server $server (really " . ip_address($host) . ")", 1;

  my $server_problem = count_problems($server);
  my $filename = "$main::server_prefix-$server.html";
  $filename =~ s/:/-/o;

  print OUT "<tr><td class='server'>";
  print OUT "<a href=\"@{[ (fileparse($filename))[0] ]}\">" if $server_problem > 0;
  print OUT "$server";
  print OUT "</a>" if $server_problem > 0;
  print OUT "</td>";
  print OUT "<td class='servertype'>" . get_server_type($server) . "</td>";
  printf OUT "<td class='unique' align=\"right\">%d</td>",
  $main::servers{$server} + $server_problem;
  if ($server_problem) {
    printf OUT "<td class='problems' id='oops' align=\"right\">%d</td>",
    $server_problem;
  } else {
    printf OUT "<td class='problems' id='zero_defects' align=\"right\">%d</td>",
    $server_problem;
  }

  my $ratio = $server_problem / ($main::servers{$server} + $server_problem) * 100;
  print OUT "<td class='ratio' align=\"right\">";
  print OUT "<strong>" unless $ratio < 0.5;
  printf OUT "%4d%%", $ratio;
  print OUT "</strong>" unless $ratio < 0.5;
  print OUT "</td>";
  print OUT "</tr>\n";

  # Create this server file
  open(SERVER, ">$filename")
    || die "Unable to open server file $filename for writing: $!";
  print SERVER "<?xml version=\"1.0\" encoding=\"ISO-8859-1\"?>\n";
  print SERVER "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\" \"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">\n";
  print SERVER "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">\n";
  print SERVER "<head>\n";
  if (!$final_page) {
    printf SERVER "<meta http-equiv=\"Refresh\" content=\"%d\" />\n",
    int($main::cp_int * 60 / 2 - 5);
  }
  print SERVER "<link rel=\"stylesheet\" type=\"text/css\" href=\"$main::opt_style\" />\n" if defined $main::opt_style;
  print SERVER "<title>Checkbot: output for server $server</title></head>\n";
  print SERVER "<body><h2><em>Checkbot</em>: report for server <tt>$server</tt></h2>\n";
  print SERVER "<p>Go To: <a href=\"@{[ (fileparse($main::file))[0] ]}\">Main report page</a>";

  printServerProblems($server, $final_page);

  print SERVER "\n";
  print SERVER signature();

  close SERVER;
}

# Return a string containing Checkbot's signature for HTML pages
sub signature {
  return "<hr />\n<p class='signature'>Page created by <a href=\"http://degraaff.org/checkbot/\">Checkbot $VERSION</a> on <em>" . localtime() . "</em>.</p>\n".
    "<p><a href=\"http://validator.w3.org/check/?uri=referer\"><img src=\"http://www.w3.org/Icons/valid-xhtml11\" alt=\"Valid XHTML 1.1\" height=\"31\" width=\"88\" /></a></p>".
    "</body></html>";
}

# Loop through all possible problems, select relevant ones for this server
# and display them in a meaningful way.
sub printServerProblems ($$) {
  my ($server, $final_page) = @_;
  $server = quotemeta $server;

  my $separator = "<hr />\n";

  my %thisServerList = ();

  # First we find all the problems for this particular server
  foreach my $url (keys %url_parent) {
    foreach my $parent (@{$url_parent{$url}}) {
      next if $parent !~ $server;
      chomp $parent;
      $thisServerList{$url_error{$url}{'code'}}{$parent}{$url}
		= $url_error{$url}{'status'};
    }
  }

  # Do a run to find all error codes on this page, and include a table
  # of contents to the actual report
  foreach my $code (sort keys %thisServerList) {
    print SERVER ", <a href=\"#rc$code\">$code ";
    print SERVER checkbot_status_message($code);
    print SERVER "</a>";
  }
  print SERVER ".</p>\n";


  # Now run through this list and print the errors
  foreach my $code (sort keys %thisServerList) {
    my $codeOut = '';

    foreach my $parent (sort keys %{ $thisServerList{$code} }) {
      my $urlOut = '';
      foreach my $url (sort keys %{ $thisServerList{$code}{$parent} }) {
	my $status = $thisServerList{$code}{$parent}{$url};
	$urlOut .= "<li><a href=\"$url\">$url</a><br/>\n";
	$urlOut .= "$status"
	  if defined $status and $status ne checkbot_status_message($code);
	$urlOut .= "</li>\n";
      }
      if ($urlOut ne '') {
	$codeOut .= "<dt><a href=\"$parent\">$parent</a>";
	$codeOut .= "<br />$url_title{$parent}\n" if defined $url_title{$parent};
	$codeOut .= "<dd><ul>\n$urlOut\n</ul>\n\n";
      }
    }

    if ($codeOut ne '') {
      print SERVER $separator if $separator;
      $separator = '';
      print SERVER "<h4 id=\"rc$code\">$code ";
      print SERVER checkbot_status_message($code);
      print SERVER "</h4>\n<dl>\n$codeOut\n</dl>\n";
    }
  }
}

sub check_point {
  if ( ($main::cp_last + 60 * $main::cp_int < time()) 
	   || ($main::opt_debug && $main::opt_verbose)) {
	&create_page(0);
	$main::cp_last = time();
	# Increase the interval from one snapshot to the next by 25%
	# until we have reached the maximum.
	$main::cp_int *= 1.25 unless $main::opt_debug;
	$main::cp_int = $main::opt_interval if $main::cp_int > $main::opt_interval;
  }
}

sub send_mail {
  my $msg = new Mail::Send;
  my $sub = 'Checkbot results for ';
  $sub .= join(', ', @starturls);
  $sub .= ': ' . $stats{'problem'} . ' errors';

  $msg->to($main::opt_mailto);
  $msg->subject($sub);

  my $fh = $msg->open;

  print $fh "Checkbot results for:\n  " . join("\n  ", @starturls) . "\n\n";
  print $fh "User-supplied note: $main::opt_note\n\n"
    if defined $main::opt_note;

  print $fh $stats{'link'}, " links were checked, and ";
  print $fh $stats{'problem'}, " problems were detected.\n";

  print $fh 'Run started on ' . localtime($start_time) . "\n";
  print $fh 'Run duration ', duration(time() - $start_time), "\n"
    if $main::useduration;


  print $fh "\n-- \nCheckbot $VERSION\n";
  print $fh "<URL:http://degraaff.org/checkbot/>\n";

  $fh->close;
}

sub print_help {
  print <<"__EOT__";
Checkbot $VERSION command line options:

  --cookies          Accept cookies from the server
  --debug            Debugging mode: No pauses, stop after 25 links.
  --file file        Use file as basis for output file names.
  --help             Provide this message.
  --mailto address   Mail brief synopsis to address when done.
  --noproxy domains  Do not proxy requests to given domains.
  --verbose          Verbose mode: display many messages about progress.
  --url url          Start URL
  --match match      Check pages only if URL matches `match'
                     If no match is given, the start URL is used as a match
  --exclude exclude  Exclude pages if the URL matches 'exclude'
  --filter regexp    Run regexp on each URL found
  --ignore ignore    Ignore URLs matching 'ignore'
  --suppress file    Use contents of 'file' to suppress errors in output
  --note note        Include Note (e.g. URL to report) along with Mail message.
  --proxy URL        URL of proxy server for HTTP and FTP requests.
  --internal-only    Only check internal links, skip checking external links.
  --sleep seconds    Sleep this many seconds between requests (default 0)
  --style url        Reference the style sheet at this URL.
  --timeout seconds  Timeout for http requests in seconds (default 120)
  --interval seconds Maximum time interval between updates (default 10800)
  --dontwarn codes   Do not write warnings for these HTTP response codes
  --enable-virtual   Use only virtual names, not IP numbers for servers
  --language         Specify 2-letter language code for language negotiation

Options --match, --exclude, and --ignore can take a perl regular expression
as their argument\n
Use 'perldoc checkbot' for more verbose documentation.
Checkbot WWW page     : http://degraaff.org/checkbot/
Mail bugs and problems: checkbot\@degraaff.org
__EOT__

  exit 0;
}

sub ip_address {
  my($host) = @_;

  return $main::ip_cache{$host} if defined $main::ip_cache{$host};

  my($name,$aliases,$adrtype,$length,@addrs) = gethostbyname($host);
  if (defined $addrs[0]) {
    my($n1,$n2,$n3,$n4) = unpack ('C4',$addrs[0]);
    $main::ip_cache{$host} = "$n1.$n2.$n3.$n4";
  } else {
    # Whee! No IP-address found for this host. Just keep whatever we
    # got for the host. If this really is some kind of error it will
    # be found later on.
    $main::ip_cache{$host} = $host;
   }
}

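# Count the number of problem links whose parent page lives on the
# given server.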
sub count_problems {
  my ($server) = @_;
  $server = quotemeta $server;
  my $count = 0;

  foreach my $url (sort keys %url_parent) {
    foreach my $parent (@{ $url_parent{$url} }) {
	$count++ if $parent =~ m/$server/;
    }
  }
  return $count;
}