matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import scipy.spatial.distance as distance
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
print("# np version = ",np.__version__)
print("# sp version = ",sp.__version__)
print("# pd version = ",pd.__version__)
# parameters for files
staOUT = "TEST"
netOUT = "AL"
comOUT = "HHZ"
locOUT2 = ""
# used for cross-correlation analysis
tw = "10.24"
freqOUT = "8-24"
max_evdist = "3"
# minimum coherency
min_coh = "9500"
# outout file; cluster_fi
#ALL.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.cluster
cluster_fi = staOUT+"."+netOUT+"."+comOUT+"."+locOUT2+".allCSs."+tw+".xc."+freqOUT+"Hz."+max_evdist+".max_dist.log.tt.coh."+min_coh+".linkage_cluster.out"
print(cluster_fi)
# min_dist = 1- coherency.
# this example find repeating events pairs if distance is less than 0.05
min_dist = 1.0 -int(min_coh)/10000
print("# min_dist = ",min_dist)
# uniq event id. this needs to create "distant" materix
evid_list_fi = "http://ncedc.org/ftp/outgoing/taira/HFRC/test.evid."+min_coh+".list" # uniq evid file from coh0.95 file
print(evid_list_fi)
# read event list file and check
# should be 7 ids
evid_list = pd.read_csv(evid_list_fi,
sep=" ",names=["evid"],header=None)
#print(evid_list.head())
print(evid_list)
# how many events?
n = len(evid_list['evid'])
print(n)
# make distance matrix dMatrix
#dMatrix = np.zeros([n, n])
dMatrix = np.ones([n,n]) # all of them have dist=1
np.fill_diagonal(dMatrix, 0)
print(dMatrix)
# coh_fi. coherency files
coh_fi = "http://ncedc.org/ftp/outgoing/taira/HFRC/"+staOUT+"."+netOUT+"."+comOUT+"."+locOUT2+".allCSs."+tw+".xc."+freqOUT+"Hz."+max_evdist+".max_dist.log.tt.coh."+min_coh+".catalog"
print(coh_fi)
# read coherency file
#1984.069.223101 1995.236.113508 9652 9652 9715 30081294
#1984.069.223101 2001.100.100318 9714 9714 9715 21157672
#1984.070.085241 1994.205.004902 9805 9805 9788 30053446
# currently coh1 = coh2
coh = pd.read_csv(coh_fi,
sep=" ",names=["ms_name1", "ms_name2", "coh1", "coh2", "evid1", "evid2"],header=None)
print(coh)
# how many coherency measurements we have?
num = len(coh['evid1'])
print(num)
# "insert" distance=(1-coherency) values at appropriate location in dMaterix.
# if we do not have the coherency value for pairs of events, distance=1 i.e., coherency = 0.0
for i in range(num):
evid1=coh['evid1'][i]
evid2=coh['evid2'][i]
evid2=coh['evid2'][i]
dcc=1.0-(coh['coh1'][i]/10000.0)
#print (evid1, evid2)
evid1_index = np.where(evid_list['evid'] == evid1 )[0].tolist()
evid2_index = np.where(evid_list['evid'] == evid2 )[0].tolist()
#print(evid1_index[0], evid2_index[0], dcc)
dMatrix[evid1_index,evid2_index]=dcc
dMatrix[evid2_index,evid1_index]=dcc
# check where we have distance less than 1. i.e., we have coherency values for these pairs
#np.where(dMatrix < 1 )[0].tolist()
print(list(zip(*np.where(dMatrix < 1))))
# checking
dMatrix
# making dArray. this needs for linkage
dArray = distance.squareform(dMatrix)
# do clustering. we use single; that aloows chain reaction. alll pairs should be a member of a sequence
#z = linkage(dArray, method = 'average')
#z = linkage(dArray, method = 'centroid')
z = linkage(dArray, method = 'single') # allow chain reaction
# checking z matrix
z
# another checking
z[0]
# make dendrogram;
# should show two sequences
dendrogram(z)
# make sequences with min_dist threshold
#cre_id = fcluster(z, t=0.05, criterion='distance') # 0.05 is included 0.95
cre_id = fcluster(z, t=min_dist, criterion='distance')
# check sequence id for each pair
# 0,3,5 should be seqid = "2"
# 1,2,4,6 should be seqid = "1"
print(cre_id)
# how many sequences?
max_id = cre_id[np.argmax(cre_id)]
print(max_id)
# how many events in each sequence? if 1 or 0, something wrong
# "len" shows how many events we have for this sequence
for i in range(1, max_id+1):
a=np.where(cre_id == i)
#print(a)
lenOUT=len(a[0])
print("i = ", i, " lenOUT = ",lenOUT)
if lenOUT <= 1:
print("something wrong!")
print("i = ", i, " lenOUT = ",lenOUT)
# which enid_index are sequence_id = 1?
cre_id_in = 1
print(np.where(cre_id == cre_id_in)[0])
# [0 1 2]
print(np.where(cre_id == cre_id_in)[0].tolist())
cre_index = np.where(cre_id == cre_id_in)[0].tolist()
# check ncss_event_id beasd on cre_index
for x, indexOUT in enumerate(cre_index):
print(indexOUT)
print(evid_list['evid'][indexOUT])
# prefix for this project
cre_prefix = "HFRC"
# making culster_file
#SEQID 9715 SUBNUM X HFRC0
#0 1984.069.223101 9652 9652 1984.069.223101 1995.236.113508 9715 30081294 HFRC0
#1 1984.069.223101 9714 9714 1984.069.223101 2001.100.100318 9715 21157672 HFRC0
#outfi = 'uw_model.ppmod'
#dgrid = 0.05 # 0.05 degress. matched to xgrid and ygrid
with open(cluster_fi , 'w') as f:
for cre_index in range(1, max_id+1): # id starts from 1.
#for cre_index in range(1, 10):
pair_n = 0
seqidOUT = cre_index - 1 # id starts from 1
print("# seqidOUT = ",seqidOUT)
print("# cre_index = ",cre_index)
evid_index = np.where(cre_id == cre_index)[0].tolist()
print("# evid_index = ",evid_index)
evid_index_num = len(evid_index)
print("# evid_index_num = ",evid_index_num)
for evid_n, indexOUT in enumerate(evid_index):
print("# evid_n = ",evid_n," indexOUT = ",indexOUT) # index
evidOUT = (evid_list['evid'][indexOUT])
print("# evidOUT = ",evidOUT)
if evid_n== 0: # header
print("# header evid_n = ",evid_n)
f.write('SEQID '+str(evidOUT)+" SUBNUN "+str(evid_index_num)+" "+cre_prefix+str(seqidOUT)+ "\n")
# loop for evid2.
#for evid_n2, indexOUT2 in enumerate(evid_index, evid_n+1):
for evid_n2 in range( evid_n + 1 , len(evid_index) ):
indexOUT2 = evid_index[evid_n2]
print("# evid_n2 = ",evid_n2," indexOUT2 = ",indexOUT2) # index
evidOUT2 = (evid_list['evid'][indexOUT2])
print("# evidOUT2 = ",evidOUT2)
coh_index = np.where( (coh['evid1'] == evidOUT) & (coh['evid2'] == evidOUT2) | (coh['evid2'] == evidOUT) & (coh['evid1'] == evidOUT2) )[0].tolist()
print("# coh_index =", coh_index)
for coh_n, coh_indexOUT in enumerate(coh_index):
#print(coh_indexOUT)
#print(coh['evid1'][coh_indexOUT])
print(coh[coh_indexOUT:coh_indexOUT+1])
coh_select = coh[coh_indexOUT:coh_indexOUT+1]
#0 1984.069.223101 9652 9652 1984.069.223101 1995.236.113508 9715 30081294 HFRC0
#print(pair_n," ", (coh['ms_name1'][coh_indexOUT])," ", str(coh['coh1'][coh_indexOUT]), " ", str(coh['coh2'][coh_indexOUT]),
# " ", (coh['ms_name1'][coh_indexOUT]), " ", (coh['ms_name2'][coh_indexOUT]),
# " ", (coh['evid1'][coh_indexOUT]), " ", (coh['evid2'][coh_indexOUT]), " ", cre_prefix+str(seqidOUT) )
#f.write(str(pair_n)+" "+(coh['ms_name1'][coh_indexOUT]))
f.write(str(pair_n)+" "+(coh['ms_name1'][coh_indexOUT])+" "+str(coh['coh1'][coh_indexOUT])+" "+str(coh['coh2'][coh_indexOUT]) +" "+(coh['ms_name1'][coh_indexOUT]) +" "+(coh['ms_name2'][coh_indexOUT])+" "+str(coh['evid1'][coh_indexOUT]) +" "+str(coh['evid2'][coh_indexOUT]) +" "+cre_prefix+str(seqidOUT) + "\n")
#+" "+str(coh['coh2'][coh_indexOUT])+" "+(coh['ms_name1'][coh_indexOUT])+" "+(coh['ms_name2'][coh_indexOUT])+" "+(coh['evid1'][coh_indexOUT])+" "+(coh['evid2'][coh_indexOUT])+" "+cre_prefix+str(seqidOUT) )
pair_n = pair_n + 1
#f.write('UWBayArea' + "\n")
#f.write(str(dgrid)+ "\n")
#f.write('{:d} {:.2f} {:.2f}'.format(len(ygrid),ygrid[0],ygrid[-1])+ "\n")
#f.write('{:d} {:.2f} {:.2f}'.format(len(xgrid),xgrid[0],xgrid[-1])+ "\n")
#f.write('{:d} {:.2f} {:.2f}'.format(len(zgrid),zgrid[0],zgrid[-1])+ "\n")
#f.write("-99 -99 -99 -99"+ "\n")
#f.write(".TRUE."+ "\n")
#SEQNUM 1442
f.write("SEQNUM "+ str(max_id) + "\n")
f.close()
# show results: two sequences: one has 4 events and other has 2 events
%%script tcsh
cat TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.linkage_cluster.out