#!/usr/bin/perl

# This script gets package names from a web-page or ftp listing or whatever,
# and sorts them in order by their version number

use strict; use warnings;
use Sort::Versions;
use File::Basename;
use HTML::Entities;
use URI;

my $sf_mirror = "http://optusnet.dl.sourceforge.net/sourceforge/";

my $prog = basename($0);

# get input from a url
my ($page_url) = (@ARGV);
# NOTE please include a trailing slash on the page_url, if it is a directory.
# fixme we're not noticing redirects or <base> tags yet.
if (!defined $page_url) {
	die "usage: $prog url-of-package-list\n";
}
my $get_command;
if (`which curl` ne "") {
	$get_command = "curl -s --location -- \Q$page_url\E";
} else {
	$get_command = "wget -q -O- -- \Q$page_url\E";
}
my $page = `$get_command`;
if ($page eq "") {
	die "$prog: failed: wget\n";
}

my @package_exts = qw(.tbz2 .tar.bz2 .tgz .tar.gz .zip .tar); #  .deb .udeb .rpm);
  # in order of preference
my $package_exts_rx = join("|", map {"\Q$_\E"} @package_exts);
$package_exts_rx = "(?:$package_exts_rx)";
$package_exts_rx = qr/$package_exts_rx/;

my @packages;
my $versions = {};
my $exts = {};
my $urls = {};
my %already;

my @urls;
if ($page =~ m{<A[^>]* href\s*=}si) {
	# this is html with at least one anchor in it.  So just check hrefs
	my $new_page = "";
	while ($page =~ m{<A[^>]* href\s*=\s*["']?([^ '">]*)}gsi) {
		push @urls, url_decode($1);
	}
} else {
	# look for packages in the text
	while ($page =~ m{(([-+a-z0-9_.:/]+/)?([-+a-z0-9_.]+)($package_exts_rx))(?:[^-+a-z0-9_.]|$)}gios) {
		push @urls, $1;
	}
}

#print "$_\n" for @urls;
#print "\n";

for my $url (@urls) {
	$url =~ m{^([-+a-z0-9_.:/]+/)?([-+a-z0-9_.]+)($package_exts_rx)([?#].*|$)}ios or next;
	my ($path, $package__version, $ext, $url_extra) = ($1, $2, $3, $4);
#	print "$package__version$ext\n";
	my ($package, $version) = $package__version =~ m{^(.*?)([-_][0-9].*)?$};
	$package = $package__version if !defined $package;
	$version = "" if !defined $version;
	$path = "" if !defined $path;
#	print "  $package $version $ext\n";
	next if $already{"$package\t$version\t$ext"}++;
	push @packages, $package unless $already{$package}++;
	push @{$versions->{$package}}, $version unless $already{"$package\t$version"}++;
	$exts->{$package}{$version}{$ext} = 1;
	my $uri = URI->new($url);
	my $absolute_url = $uri->abs($page_url)->as_string;
	$absolute_url = rewrite_url($absolute_url);
	push @{$urls->{$package}{$version}{$ext}}, $absolute_url unless $already{"$package\t$version\t$ext\t$absolute_url"}++;
}

#print "$_\n" for @packages;
#print "\n";



for my $package (@packages) {
	my $versions = $versions->{$package};
	@$versions = sort { versioncmp($a, $b) } @$versions;
	
#	print "$package\n";
#	for my $version (@$versions) {
#		print "\t$version\n";
#	}
}

for my $package (@packages) {
	my $versions = $versions->{$package};
	my $version = $versions->[-1];
#	print "$package$version\n";
	for my $ext (@package_exts) {
		if ($exts->{$package}{$version}{$ext}) {
			my $urls = $urls->{$package}{$version}{$ext};
			my $url = $urls->[0];	

			print "$package$version\t$url\n";

#			for my $url (@{$urls->{$package}{$version}{$ext}}) {
#				print "\t$url\n";
#			}
			last;
		}
	}
}

exit;

sub url_decode {
	my ($url) = @_;
	$url = decode_entities($url);
	$url =~ s/\+/ /sg;
	$url =~ s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/seg;
	return $url;
}

sub rewrite_url {
	my ($url) = @_;
	for ($url) {
		s{^http://prdownloads.(?:sourceforge|sf).net/}{$sf_mirror} and s/\?.*//;
	}
	return $url;
}
