#!/usr/bin/perl
#
# FetchEventData
#
# Fetch waveforms from pre-assembled event gathers.

# The default web service are from the NCEDC, other web services may be
# specified by setting the following environment variables:
#
# SERVICEBASE = the base URI of the service(s) to use (https://service.ncedc.org/)
# TIMESERIESWS = complete URI of service (https://service.ncedc.org/ncedcws/eventdata/1)
#
# This program is written to select and fetch waveform data
# from pre-assembled collection of waveforms for events.
#
# Dependencies: This script should run without problems on Perl
# release 5.10 or newer, older versions of Perl might require the
# installation of the following modules (and their dependencies):
#   Bundle::LWP (libwww-perl)
#
## Data selection
#
# Data is generally selected by specifying network, station, location,
# channel, quality, start time and end time.  The name parameters may
# contain wildcard characters.  All input options are optional but
# waveform requests should include a time window.  Data may be
# selected one of three ways:
#
# 1) Command line arguments: -N, -S, -L, -C, -Q, -s, -e
#
# 2) A BREQ_FAST formatted file, http://www.iris.edu/manuals/breq_fast.htm
#
# 3) A selection file containing a list of:
#    Net Sta Loc Chan Start End
#
# Example selection file contents:
# II BFO 00 BHZ 2011-01-01T00:00:00 2011-01-01T01:00:00
# IU ANMO 00 BHZ 2011-01-01T00:00:00 2011-01-01T01:00:00
# IU COLA 00 BHZ 2011-01-01T00:00:00 2011-01-01T01:00:00
#
# For the command line arguments and the selection file the network,
# station location and channel fields may contain the common * and ?
# wildcards, meaning zero-to-many and a single character respectively.
# These fields may also be comma-separated lists, for example, the
# network may be specified as II,IU,TA to select three networks.
#
## Data output
#
# miniSEED: waveform data will be requested based on the selection and all
# written to the single file.
#
# 2013.341
#  - Initial coding by Doug Neuhauser, UC Berkeley Seismological Laboratory,
#    based on FetchData by Chad Trabant, IRIS Data Management Center,

use strict;
use File::Basename;
use Getopt::Long;
use LWP::UserAgent;
use HTTP::Status qw(status_message);
use HTTP::Date;
use Time::HiRes;

my $version = "2013.341";

my $scriptname = basename($0);

# Default web service base
my $servicebase = 'https://service.ncedc.org';

# Check for environment variable overrides for servicebase
$servicebase = $ENV{'SERVICEBASE'} if ( exists $ENV{'SERVICEBASE'} );

# Web service for time series data
my $timeseriesservice = "$servicebase/ncedcws/eventdata/1";

# Check for environment variable override for timeseriesservice
$timeseriesservice = $ENV{'TIMESERIESWS'} if ( exists $ENV{'TIMESERIESWS'} );

my $useragent = "$scriptname/$version Perl/$] " . new LWP::UserAgent->_agent;

my $usage      = undef;
my $verbose    = undef;
my $nobsprint  = undef;

my $eventid    = undef;
my $catalog    = undef;
my $net        = undef;
my $sta        = undef;
my $loc        = undef;
my $chan       = undef;
my $qual       = "B";
my $starttime  = undef;
my $endtime    = undef;
my $selectfile = undef;
my $bfastfile  = undef;
my $mslopt     = undef;
my $lsoopt     = undef;
my $appname    = undef;
my $outfile    = undef;

my $auth       = undef;

# Parse command line arguments
Getopt::Long::Configure ("bundling_override");
my $getoptsret = GetOptions ( 'help|usage|h'   => \$usage,
                              'verbose|v+'     => \$verbose,
                              'nobs'           => \$nobsprint,
                              'eventid|E=s'    => \$eventid,
                              'catalog|c=s'    => \$catalog,
                              'net|N=s'        => \$net,
                              'sta|S=s'        => \$sta,
                              'loc|L=s'        => \$loc,
                              'chan|C=s'       => \$chan,
                              'qual|Q=s'       => \$qual,
			      'starttime|s=s'  => \$starttime,
			      'endtime|e=s'    => \$endtime,
			      'selectfile|l=s' => \$selectfile,
			      'bfastfile|b=s'  => \$bfastfile,
			      'msl=s'          => \$mslopt,
			      'lso'            => \$lsoopt,
			      'appname|A=s'    => \$appname,
			      'outfile|o=s'    => \$outfile,
			      'timeseriesws=s' => \$timeseriesservice,
			    );

my $required =  ( defined $eventid || defined $selectfile );

if ( ! $getoptsret || $usage || ! $required ) {
  print "$scriptname: collect time series and related metadata (version $version)\n";
#::   print "https://service.ncedc.org/clients/\n\n";
  print "Usage: $scriptname [options]\n\n";
  print " Options:\n";
  print " -v                Increase verbosity, may be specified multiple times\n";
  print " -E,--eventid      Eventid\n";
  print " -c,--catalog      catalog for eventid\n";
  print " -N,--net          Network code, list and wildcards (* and ?) accepted\n";
  print " -S,--sta          Station code, list and wildcards (* and ?) accepted\n";
  print " -L,--loc          Location ID, list and wildcards (* and ?) accepted\n";
  print " -C,--chan         Channel codes, list and wildcards (* and ?) accepted\n";
  print " -Q,--qual         Quality indicator, default is best\n";
  print " -s starttime      Specify start time (YYYY-MM-DD,HH:MM:SS.ssssss)\n";
  print " -e endtime        Specify end time (YYYY-MM-DD,HH:MM:SS.ssssss)\n";
  print " -l listfile       Read list of selections from file\n";
  print " -b bfastfile      Read list of selections from BREQ_FAST file\n";
  print " -msl length       Limit returned data to a minimum segment length\n";
  print " -lso              Limit returned data to the longest segment only\n";
  print " -A appname        Application/version string for identification\n";
  print "\n";
  print " -o outfile        Fetch time series data and write to output file\n";
  print "\n";
  exit 1;
}

if ( ! $outfile ) {
  die "No output options specified, try -h for usage information\n";
}

# Print script name and local time string
if ( $verbose ) {
  my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
  printf STDERR "$scriptname ($version) at %4d-%02d-%02d %02d:%02d:%02d\n", $year+1900, $mon+1, $mday, $hour, $min, $sec;
}

# Check for eventid.
if ( $outfile && ( ! defined $eventid ) ) {
  die "Cannot request event timeseries data without an eventid\n";
}

# Normalize time strings given on the command line
if ( $starttime ) {
  my ($year,$month,$mday,$hour,$min,$sec,$subsec) = split (/[-:,.\s\/T]/, $starttime);
  $starttime = sprintf ("%04d-%02d-%02dT%02d:%02d:%02d", $year, $month, $mday, $hour, $min, $sec);
  $starttime .= ".$subsec" if ( $subsec );
}

if ( $endtime ) {
  my ($year,$month,$mday,$hour,$min,$sec,$subsec) = split (/[-:,.\s\/T]/, $endtime);
  $endtime = sprintf ("%04d-%02d-%02dT%02d:%02d:%02d", $year, $month, $mday, $hour, $min, $sec);
  $endtime .= ".$subsec" if ( $subsec );
}

# An array to hold data selections
my @selections = ();

# Add command line selection to list
if ( defined $net || defined $sta || defined $loc || defined $chan ||
     defined $starttime || defined $endtime ) {
  push (@selections,"$net|$sta|$loc|$chan|$starttime|$endtime");
}

# Read selection list file
if ( $selectfile ) {
  print STDERR "Reading data selection from list file '$selectfile'\n";
  &ReadSelectFile ($selectfile);
}

# Read BREQ_FAST file
if ( $bfastfile ) {
  print STDERR "Reading data selection from BREQ_FAST file '$bfastfile'\n";
  &ReadBFastFile ($bfastfile);
}

# Report complete data selections
if ( $verbose > 2 ) {
  print STDERR "== Data selections ==\n";
  foreach my $select ( @selections ) {
    print STDERR "    $select\n";
  }
}

# An array to hold channel list and metadata
my %request = (); # Value is metadata range for selection

# Build request hash directly from selections
foreach my $selection ( @selections ) {
  my ($snet,$ssta,$sloc,$schan,$sstart,$send) = split (/\|/,$selection);

  # Subsitute non-specified fields with wildcards
  $snet = "*" if ( ! $snet );
  $ssta = "*" if ( ! $ssta );
  $sloc = "*" if ( ! $sloc );
  $schan = "*" if ( ! $schan );

  $request{"$snet|$ssta|$sloc|$schan|$sstart|$send"} = "$sstart|$send";
}

# Report complete data request
if ( $verbose > 2 ) {
  print STDERR "== Request list ==\n";
  foreach my $req ( sort keys %request ) {
    print STDERR "    $req (metadata: $request{$req})\n";
  }
}

# Track bytes downloaded in callback handlers
my $datasize = 0;

# Fetch time series data if output file specified
&FetchTimeSeriesData() if ( $outfile );

my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
printf STDERR "DONE at %4d-%02d-%02d %02d:%02d:%02d\n", $year+1900, $mon+1, $mday, $hour, $min, $sec;
## End of main


######################################################################
# ReadSelectFile:
#
# Read selection list file and add entries to the @selections array.
#
# Selection lines are expected to be in the following form:
#
# "Net Sta Loc Chan Start End"
#
# The Net, Sta, Loc and Channel fields are required and can be
# specified as wildcards.  Start and End are optional.
######################################################################
sub ReadSelectFile {
  my $selectfile = shift;

  open (SF, "<$selectfile") || die "Cannot open '$selectfile': $!\n";

  foreach my $line ( <SF> ) {
    chomp $line;
    next if ( $line =~ /^\#/ ); # Skip comment lines

    my ($net,$sta,$loc,$chan,$start,$end) = split (' ', $line);

    next if ( ! defined $chan );

    # Normalize time strings
    if ( $start ) {
      my ($year,$month,$mday,$hour,$min,$sec,$subsec) = split (/[-:,.\s\/T]/, $start);
      $start = sprintf ("%04d-%02d-%02dT%02d:%02d:%02d", $year, $month, $mday, $hour, $min, $sec);
      $start .= ".$subsec" if ( $subsec );
    }

    if ( $end ) {
      my ($year,$month,$mday,$hour,$min,$sec,$subsec) = split (/[-:,.\s\/T]/, $end);
      $end = sprintf ("%04d-%02d-%02dT%02d:%02d:%02d", $year, $month, $mday, $hour, $min, $sec);
      $end .= ".$subsec" if ( $subsec );
    }

    # Add selection to global list
    push (@selections,"$net|$sta|$loc|$chan|$start|$end");
  }

  close SF;
} # End of ReadSelectFile()


######################################################################
# ReadBFastFile:
#
# Read BREQ_FAST file and add entries to the @selections array.
#
######################################################################
sub ReadBFastFile {
  my $bfastfile = shift;

  open (BF, "<$bfastfile") || die "Cannot open '$bfastfile': $!\n";

  my $linecount = 0;
  BFLINE: foreach my $line ( <BF> ) {
    chomp $line;
    $linecount++;
    next if ( ! $line ); # Skip empty lines

    # Capture .QUALTIY header
    if ( $line =~ /^\.QUALITY .*$/ ) {
      ($qual) = $line =~ /^\.QUALITY ([DRQMBE])/;
      next;
    }

    next if ( $line =~ /^\./ ); # Skip other header lines

    my ($sta,$net,$syear,$smon,$sday,$shour,$smin,$ssec,$eyear,$emon,$eday,$ehour,$emin,$esec,$count,@chans) = split (' ', $line);

    # Simple validation of BREQ FAST fields
    if ( $sta !~ /^[A-Za-z0-9*?]{1,5}$/ ) {
      print "Unrecognized station code: '$sta', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $net !~ /^[-_A-Za-z0-9*?]+$/ ) {
      print "Unrecognized network code: '$net', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $syear !~ /^\d\d\d\d$/ ) {
      print "Unrecognized start year: '$syear', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $smon !~ /^\d{1,2}$/ ) {
      print "Unrecognized start month: '$smon', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $sday !~ /^\d{1,2}$/ ) {
      print "Unrecognized start day: '$sday', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $shour !~ /^\d{1,2}$/ ) {
      print "Unrecognized start hour: '$shour', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $smin !~ /^\d{1,2}$/ ) {
      print "Unrecognized start min: '$smin', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $ssec !~ /^\d{1,2}\.?\d{0,6}?$/ ) {
      print "Unrecognized start seconds: '$ssec', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $eyear !~ /^\d\d\d\d$/ ) {
      print "Unrecognized end year: '$eyear', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $emon !~ /^\d{1,2}$/ ) {
      print "Unrecognized end month: '$emon', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $eday !~ /^\d{1,2}$/ ) {
      print "Unrecognized end day: '$eday', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $ehour !~ /^\d{1,2}$/ ) {
      print "Unrecognized end hour: '$ehour', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $emin !~ /^\d{1,2}$/ ) {
      print "Unrecognized end min: '$emin', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $esec !~ /^\d{1,2}\.?\d{0,6}?$/ ) {
      print "Unrecognized end seconds: '$esec', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( $count !~ /^\d+$/ || $count <= 0 ) {
      print "Invalid channel count field: '$count', skipping line $linecount\n" if ( $verbose );
      next;
    }
    if ( scalar @chans <= 0 ) {
      print "No channels specified, skipping line $linecount\n" if ( $verbose );
      next;
    }

    # Extract location ID if present, i.e. if channel count is one less than present
    my $loc = undef;
    $loc = pop @chans if ( scalar @chans == ($count+1) );

    if ( $loc && $loc !~ /^[A-Za-z0-9*?\-]{1,2}$/ ) {
      print "Unrecognized location ID: '$loc', skipping line $linecount\n" if ( $verbose );
      next;
    }

    foreach my $chan ( @chans ) {
      if ( $chan !~ /^[A-Za-z0-9*?]{3,3}$/ ) {
	print "Unrecognized channel codes: '$chan', skipping line $linecount\n" if ( $verbose );
	next BFLINE;
      }
    }

    if ( scalar @chans != $count ) {
      printf "Channel count field ($count) does not match number of channels specified (%d), skipping line $linecount\n",
	scalar @chans if ( $verbose );
      next;
    }

    # Normalize time strings
    my ($ssec,$ssub) = split (/\./, $ssec);
    my $start = sprintf ("%04d-%02d-%02dT%02d:%02d:%02d", $syear, $smon, $sday, $shour, $smin, $ssec);
    $start .= ".$ssub" if ( $ssub );
    my ($esec,$esub) = split (/\./, $esec);
    my $end = sprintf ("%04d-%02d-%02dT%02d:%02d:%02d", $eyear, $emon, $eday, $ehour, $emin, $esec);
    $end .= ".$esub" if ( $esub );

    # Add selection to global list for each channel
    foreach my $chan ( @chans ) {
      push (@selections,"$net|$sta|$loc|$chan|$start|$end");
    }
  }

  close BF;
} # End of ReadBFastFile()


######################################################################
# FetchTimeSeriesData:
#
# Collect time series data for each entry in the %request hash.  All
# returned data is written to the global output file (outfile).
#
# The request list is separatated into groups where the group size is
# defined in terms of station-days.  If the request for a group fails
 # it will be retried, after too many failures.
#
######################################################################
sub FetchTimeSeriesData {
  # Open output file
  open (OUT, ">$outfile") || die "Cannot open output file '$outfile': $!\n";

  # Create HTTP user agent
  my $ua = RequestAgent->new();
  $ua->env_proxy;

  my $count = 0;

  my @reqs;
  foreach my $req ( sort keys %request ) {
    my ($wnet,$wsta,$wloc,$wchan,$wstart,$wend) = split (/\|/, $req);
    push (@reqs, "$wnet $wsta $wloc $wchan $wstart $wend");
    ++$count;
  }

  print STDERR "Fetching time series data ($count selections)\n" if ( $verbose );
  my $ftime = Time::HiRes::time;
  my $totalbytes = 0;

  my $redocnt = 0;
  my $outoffset = 0;

  {
    # Create web service URI
    my $query = "query";
    my $uri = "${timeseriesservice}/$query";

    # Create POST data selection: specify options followed by selections
    my $postdata = "eventid=$eventid\n";
    $postdata .= "catalog=$catalog\n" if ( defined $catalog );
    $postdata .= "minimumlength=$mslopt\n" if ( defined $mslopt );
    $postdata .= "longestonly=true\n" if ( defined $lsoopt );
#::     $postdata .= "quality=$qual\n";

    foreach my $req ( @reqs ) {
      $postdata .= "$req\n";
    }

    print STDERR "Time series URI: '$uri'\n" if ( $verbose > 1 );
    print STDERR "Data selection (POST):\n$postdata" if ( $verbose > 1 );

    print STDERR "Downloading time series data :: " if ( $verbose );

    $datasize = 0;

    # Fetch time series data from web service using callback routine
    my $response = $ua->post($uri, Content => $postdata, ':content_cb' => \&DLCallBack );

    if ( $response->code == 204 ) {
      print (STDERR "No data available\n") if ( $verbose );
    }
    elsif ( $response->code == 401 ) {
      print (STDERR "AUTHORIZATION FAILED, username and password not recognized\n");
      last;
    }
    elsif ( ! $response->is_success() ) {
      print (STDERR "Error fetching data: "
	     . $response->code . " :: " . status_message($response->code) . "\n");
      print STDERR "------\n" . $response->decoded_content . "\n------\n";
      print STDERR "  URI: '$uri'\n" if ( $verbose > 1 );

      # For real output files rewind position to the end of the last group data
      seek (OUT, $outoffset, 0) if ( $outfile ne "-" );

      # Retry in 10 seconds or give up if already tried 60 times.
      if ( $response->code != 400 && $redocnt < 60 ) {
	print STDERR "Retrying request in 10 seconds\n";
	sleep 10;
	$redocnt++;
	goto REDOGROUP;
      }
      else {
	print STDERR "Too many retries, giving up.\n";
	last;
      }
    }
    else {
      printf (STDERR "%s\n", ($nobsprint)?sizestring($datasize):"") if ( $verbose );
    }

    $outoffset = tell (OUT);
    $totalbytes += $datasize;
  }

  close OUT;

  my $duration = Time::HiRes::time - $ftime;
  my $rate = $totalbytes/(($duration)?$duration:0.000001);
  printf (STDERR "Received %s of time series data for event %s in %.1f seconds (%s/s)\n",
	  sizestring($totalbytes), $eventid, $duration, sizestring($rate));

  # Remove empty file
  unlink $outfile if ( -z $outfile );
} # End of FetchTimeSeriesData


######################################################################
# DLCallBack:
#
# A call back for LWP downloading.
#
# Write received data to output file, tally up the received data size
# and print and updated (overwriting) byte count string.
######################################################################
sub DLCallBack {
  my ($data, $response, $protocol) = @_;
  print OUT $data;
  $datasize += length($data);

  if ( $verbose && ! $nobsprint ) {
    printf (STDERR "%-10.10s\b\b\b\b\b\b\b\b\b\b", sizestring($datasize));
  }
}


######################################################################
# sizestring (bytes):
#
# Return a clean size string for a given byte count.
######################################################################
sub sizestring { # sizestring (bytes)
  my $bytes = shift;

  if ( $bytes < 1000 ) {
    return sprintf "%d Bytes", $bytes;
  }
  elsif ( ($bytes / 1024) < 1000 ) {
    return sprintf "%.1f KB", $bytes / 1024;
  }
  elsif ( ($bytes / 1024 / 1024) < 1000 ) {
    return sprintf "%.1f MB", $bytes / 1024 / 1024;
  }
  elsif ( ($bytes / 1024 / 1024 / 1024) < 1000 ) {
    return sprintf "%.1f GB", $bytes / 1024 / 1024 / 1024;
  }
  elsif ( ($bytes / 1024 / 1024 / 1024 / 1024) < 1000 ) {
    return sprintf "%.1f TB", $bytes / 1024 / 1024 / 1024 / 1024;
  }
  else {
    return "";
  }
} # End of sizestring()


######################################################################
#
# Package RequestAgent: a superclass for LWP::UserAgent with override
# of LWP::UserAgent methods to set default user agent and handle
# authentication credentials.
#
######################################################################
BEGIN {
  use LWP;
  package RequestAgent;
  our @ISA = qw(LWP::UserAgent);

  sub new
    {
      my $self = LWP::UserAgent::new(@_);
      my $fulluseragent = $useragent;
      $fulluseragent .= " ($appname)" if ( $appname );
      $self->agent($fulluseragent);
      $self;
    }

  sub get_basic_credentials
    {
      my ($self, $realm, $uri) = @_;

      if ( defined $auth ) {
        return split(':', $auth, 2);
      }
      elsif (-t) {
        my $netloc = $uri->host_port;
        print "\n";
        print "Enter username for $realm at $netloc: ";
        my $user = <STDIN>;
        chomp($user);
        return (undef, undef) unless length $user;
        print "Password: ";
        system("stty -echo");
        my $password = <STDIN>;
        system("stty echo");
        print "\n";  # because we disabled echo
        chomp($password);
        return ($user, $password);
      }
      else {
        return (undef, undef)
      }
    }
} # End of LWP::UserAgent override