matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import scipy.spatial.distance as distance
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster

print("# np version = ",np.__version__)
print("# sp version = ",sp.__version__)
print("# pd version = ",pd.__version__)

# np version =  1.16.4
# sp version =  1.3.0
# pd version =  0.24.2

# parameters for files 
staOUT = "TEST"
netOUT = "AL"
comOUT = "HHZ"
locOUT2 = ""

# used for cross-correlation analysis
tw = "10.24"
freqOUT = "8-24"
max_evdist = "3"

# minimum coherency
min_coh = "9500"

# outout file; cluster_fi
#ALL.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.cluster
cluster_fi = staOUT+"."+netOUT+"."+comOUT+"."+locOUT2+".allCSs."+tw+".xc."+freqOUT+"Hz."+max_evdist+".max_dist.log.tt.coh."+min_coh+".linkage_cluster.out" 
print(cluster_fi)

TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.linkage_cluster.out

# min_dist = 1- coherency. 
# this example find repeating events pairs if distance is less than 0.05
min_dist = 1.0 -int(min_coh)/10000
print("# min_dist = ",min_dist)

# min_dist =  0.050000000000000044

# uniq event id. this needs to create "distant" materix 
evid_list_fi = "http://ncedc.org/ftp/outgoing/taira/HFRC/test.evid."+min_coh+".list" # uniq evid file from coh0.95 file
print(evid_list_fi)

http://ncedc.org/ftp/outgoing/taira/HFRC/test.evid.9500.list

# read event list file and check
# should be 7 ids
evid_list = pd.read_csv(evid_list_fi,   
                       sep=" ",names=["evid"],header=None)
#print(evid_list.head())
print(evid_list)

       evid
0  21130389
1  21187940
2  21410540
3  51194429
4  71565055
5  71588265
6  72590021

# how many events?
n = len(evid_list['evid'])
print(n)

7

# make distance matrix dMatrix
#dMatrix = np.zeros([n, n])
dMatrix = np.ones([n,n]) # all of them have dist=1
np.fill_diagonal(dMatrix, 0)
print(dMatrix)

[[0. 1. 1. 1. 1. 1. 1.]
 [1. 0. 1. 1. 1. 1. 1.]
 [1. 1. 0. 1. 1. 1. 1.]
 [1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1.]
 [1. 1. 1. 1. 1. 0. 1.]
 [1. 1. 1. 1. 1. 1. 0.]]

# coh_fi. coherency files
coh_fi = "http://ncedc.org/ftp/outgoing/taira/HFRC/"+staOUT+"."+netOUT+"."+comOUT+"."+locOUT2+".allCSs."+tw+".xc."+freqOUT+"Hz."+max_evdist+".max_dist.log.tt.coh."+min_coh+".catalog"
print(coh_fi)

http://ncedc.org/ftp/outgoing/taira/HFRC/TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.catalog

# read coherency file
#1984.069.223101 1995.236.113508 9652 9652 9715 30081294
#1984.069.223101 2001.100.100318 9714 9714 9715 21157672
#1984.070.085241 1994.205.004902 9805 9805 9788 30053446
# currently coh1 = coh2 
coh = pd.read_csv(coh_fi,   
                       sep=" ",names=["ms_name1", "ms_name2", "coh1", "coh2", "evid1", "evid2"],header=None)
print(coh)

          ms_name1         ms_name2  coh1  coh2     evid1     evid2
0  2000.292.230439  2011.172.174716  9708  9708  21130389  71588265
1  2008.012.020128  2011.172.174716  9631  9631  51194429  71588265
2  2001.221.021038  2004.290.072635  9858  9858  21187940  21410540
3  2001.221.021038  2011.125.090631  9788  9788  21187940  71565055
4  2004.290.072635  2011.125.090631  9685  9685  21410540  71565055
5  2004.290.072635  2016.038.220752  9564  9564  21410540  72590021
6  2011.125.090631  2016.038.220752  9746  9746  71565055  72590021

# how many coherency measurements we have?
num = len(coh['evid1'])
print(num)

7

# "insert" distance=(1-coherency) values at appropriate location in dMaterix.
# if we do not have the coherency value for pairs of events, distance=1 i.e., coherency = 0.0 
for i in range(num):
    evid1=coh['evid1'][i]
    evid2=coh['evid2'][i]
    evid2=coh['evid2'][i]
    dcc=1.0-(coh['coh1'][i]/10000.0)
    #print (evid1, evid2)
    evid1_index  = np.where(evid_list['evid'] == evid1 )[0].tolist()
    evid2_index  = np.where(evid_list['evid'] == evid2 )[0].tolist()
    #print(evid1_index[0], evid2_index[0], dcc)
    dMatrix[evid1_index,evid2_index]=dcc
    dMatrix[evid2_index,evid1_index]=dcc

# check where we have distance less than 1. i.e., we have coherency values for these pairs
#np.where(dMatrix < 1 )[0].tolist()
print(list(zip(*np.where(dMatrix < 1))))

[(0, 0), (0, 5), (1, 1), (1, 2), (1, 4), (2, 1), (2, 2), (2, 4), (2, 6), (3, 3), (3, 5), (4, 1), (4, 2), (4, 4), (4, 6), (5, 0), (5, 3), (5, 5), (6, 2), (6, 4), (6, 6)]

#  checking
dMatrix

array([[0.    , 1.    , 1.    , 1.    , 1.    , 0.0292, 1.    ],
       [1.    , 0.    , 0.0142, 1.    , 0.0212, 1.    , 1.    ],
       [1.    , 0.0142, 0.    , 1.    , 0.0315, 1.    , 0.0436],
       [1.    , 1.    , 1.    , 0.    , 1.    , 0.0369, 1.    ],
       [1.    , 0.0212, 0.0315, 1.    , 0.    , 1.    , 0.0254],
       [0.0292, 1.    , 1.    , 0.0369, 1.    , 0.    , 1.    ],
       [1.    , 1.    , 0.0436, 1.    , 0.0254, 1.    , 0.    ]])

# making dArray. this needs for linkage
dArray = distance.squareform(dMatrix)

# do clustering. we use single; that aloows chain reaction. alll pairs should be a member of a sequence
#z = linkage(dArray, method = 'average')
#z = linkage(dArray, method = 'centroid')
z = linkage(dArray, method = 'single') # allow chain reaction

# checking z matrix
z

array([[ 1.    ,  2.    ,  0.0142,  2.    ],
       [ 4.    ,  7.    ,  0.0212,  3.    ],
       [ 6.    ,  8.    ,  0.0254,  4.    ],
       [ 0.    ,  5.    ,  0.0292,  2.    ],
       [ 3.    , 10.    ,  0.0369,  3.    ],
       [ 9.    , 11.    ,  1.    ,  7.    ]])

# another checking
z[0]

array([1.    , 2.    , 0.0142, 2.    ])

# make dendrogram; 
# should show two sequences
dendrogram(z)

{'icoord': [[25.0, 25.0, 35.0, 35.0],
  [15.0, 15.0, 30.0, 30.0],
  [5.0, 5.0, 22.5, 22.5],
  [55.0, 55.0, 65.0, 65.0],
  [45.0, 45.0, 60.0, 60.0],
  [13.75, 13.75, 52.5, 52.5]],
 'dcoord': [[0.0, 0.01419999999999999, 0.01419999999999999, 0.0],
  [0.0, 0.021199999999999997, 0.021199999999999997, 0.01419999999999999],
  [0.0, 0.025399999999999978, 0.025399999999999978, 0.021199999999999997],
  [0.0, 0.029200000000000004, 0.029200000000000004, 0.0],
  [0.0, 0.036900000000000044, 0.036900000000000044, 0.029200000000000004],
  [0.025399999999999978, 1.0, 1.0, 0.036900000000000044]],
 'ivl': ['6', '4', '1', '2', '3', '0', '5'],
 'leaves': [6, 4, 1, 2, 3, 0, 5],
 'color_list': ['g', 'g', 'g', 'r', 'r', 'b']}

# make sequences with min_dist threshold 
#cre_id = fcluster(z, t=0.05, criterion='distance') # 0.05 is included 0.95
cre_id = fcluster(z, t=min_dist, criterion='distance')

# check sequence id for each pair
# 0,3,5 should be seqid = "2" 
# 1,2,4,6 should be seqid = "1"
print(cre_id)

[2 1 1 2 1 2 1]

# how many sequences?
max_id = cre_id[np.argmax(cre_id)]
print(max_id)

2

# how many events in each sequence? if 1 or 0, something wrong
# "len" shows how many events we have for this sequence
for i in range(1, max_id+1):
    a=np.where(cre_id == i)
    #print(a)
    lenOUT=len(a[0])
    print("i = ", i, " lenOUT = ",lenOUT)
    if lenOUT <= 1:
        print("something wrong!")
        print("i = ", i, " lenOUT = ",lenOUT)

i =  1  lenOUT =  4
i =  2  lenOUT =  3

# which enid_index are sequence_id = 1?
cre_id_in = 1
print(np.where(cre_id == cre_id_in)[0])
# [0 1 2]
print(np.where(cre_id == cre_id_in)[0].tolist())
cre_index  = np.where(cre_id == cre_id_in)[0].tolist()

[1 2 4 6]
[1, 2, 4, 6]

# check ncss_event_id beasd on cre_index
for x, indexOUT in enumerate(cre_index):
        print(indexOUT)
        print(evid_list['evid'][indexOUT])

1
21187940
2
21410540
4
71565055
6
72590021

# prefix for this project
cre_prefix = "HFRC"

# making culster_file
#SEQID 9715 SUBNUM X HFRC0
#0 1984.069.223101 9652 9652 1984.069.223101 1995.236.113508 9715 30081294 HFRC0
#1 1984.069.223101 9714 9714 1984.069.223101 2001.100.100318 9715 21157672 HFRC0
#outfi = 'uw_model.ppmod'
#dgrid = 0.05 # 0.05 degress. matched to xgrid and ygrid
with open(cluster_fi , 'w') as f:
    for cre_index in range(1, max_id+1): # id starts from 1.
    #for cre_index in range(1, 10):
        pair_n = 0
        seqidOUT = cre_index - 1 # id starts from 1
        print("# seqidOUT = ",seqidOUT)
        print("# cre_index = ",cre_index)
        evid_index  = np.where(cre_id == cre_index)[0].tolist()
        print("# evid_index = ",evid_index)
        evid_index_num = len(evid_index)
        print("# evid_index_num = ",evid_index_num)

        for evid_n, indexOUT in enumerate(evid_index):
            print("# evid_n = ",evid_n," indexOUT = ",indexOUT) # index
            evidOUT = (evid_list['evid'][indexOUT])
            print("# evidOUT = ",evidOUT)

            if evid_n== 0: # header
                print("# header evid_n = ",evid_n)
                f.write('SEQID '+str(evidOUT)+" SUBNUN  "+str(evid_index_num)+" "+cre_prefix+str(seqidOUT)+ "\n")
            
            # loop for evid2. 
            #for evid_n2, indexOUT2 in enumerate(evid_index, evid_n+1):
            for evid_n2 in range( evid_n + 1 ,  len(evid_index) ):
                indexOUT2 = evid_index[evid_n2]
                print("# evid_n2 = ",evid_n2," indexOUT2 = ",indexOUT2) # index
                evidOUT2 = (evid_list['evid'][indexOUT2])
                print("# evidOUT2 = ",evidOUT2)
                coh_index = np.where( (coh['evid1'] == evidOUT)  & (coh['evid2'] == evidOUT2) |  (coh['evid2'] == evidOUT)  & (coh['evid1'] == evidOUT2) )[0].tolist()
                print("# coh_index =", coh_index)
                
                for coh_n, coh_indexOUT in enumerate(coh_index):
                    #print(coh_indexOUT)
                    #print(coh['evid1'][coh_indexOUT])
                    print(coh[coh_indexOUT:coh_indexOUT+1])
                    coh_select = coh[coh_indexOUT:coh_indexOUT+1]
                    #0 1984.069.223101 9652 9652 1984.069.223101 1995.236.113508 9715 30081294 HFRC0
                    #print(pair_n," ", (coh['ms_name1'][coh_indexOUT])," ", str(coh['coh1'][coh_indexOUT]), " ", str(coh['coh2'][coh_indexOUT]),
                    #      " ", (coh['ms_name1'][coh_indexOUT]), " ", (coh['ms_name2'][coh_indexOUT]), 
                    #      " ", (coh['evid1'][coh_indexOUT]), " ", (coh['evid2'][coh_indexOUT]), " ", cre_prefix+str(seqidOUT) )
                            
                    #f.write(str(pair_n)+" "+(coh['ms_name1'][coh_indexOUT]))
                    f.write(str(pair_n)+" "+(coh['ms_name1'][coh_indexOUT])+" "+str(coh['coh1'][coh_indexOUT])+" "+str(coh['coh2'][coh_indexOUT])   +" "+(coh['ms_name1'][coh_indexOUT])  +" "+(coh['ms_name2'][coh_indexOUT])+" "+str(coh['evid1'][coh_indexOUT]) +" "+str(coh['evid2'][coh_indexOUT])    +" "+cre_prefix+str(seqidOUT)  + "\n")
                            
                            #+" "+str(coh['coh2'][coh_indexOUT])+" "+(coh['ms_name1'][coh_indexOUT])+" "+(coh['ms_name2'][coh_indexOUT])+" "+(coh['evid1'][coh_indexOUT])+" "+(coh['evid2'][coh_indexOUT])+" "+cre_prefix+str(seqidOUT) )
                    pair_n = pair_n + 1

                    
        
    #f.write('UWBayArea' + "\n")
    #f.write(str(dgrid)+ "\n")
    #f.write('{:d} {:.2f} {:.2f}'.format(len(ygrid),ygrid[0],ygrid[-1])+ "\n")
    #f.write('{:d} {:.2f} {:.2f}'.format(len(xgrid),xgrid[0],xgrid[-1])+ "\n")
    #f.write('{:d} {:.2f} {:.2f}'.format(len(zgrid),zgrid[0],zgrid[-1])+ "\n")
    #f.write("-99 -99 -99 -99"+ "\n")
    #f.write(".TRUE."+ "\n")
#SEQNUM 1442
 
    f.write("SEQNUM "+ str(max_id) + "\n")
f.close()

# seqidOUT =  0
# cre_index =  1
# evid_index =  [1, 2, 4, 6]
# evid_index_num =  4
# evid_n =  0  indexOUT =  1
# evidOUT =  21187940
# header evid_n =  0
# evid_n2 =  1  indexOUT2 =  2
# evidOUT2 =  21410540
# coh_index = [2]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
2  2001.221.021038  2004.290.072635  9858  9858  21187940  21410540
# evid_n2 =  2  indexOUT2 =  4
# evidOUT2 =  71565055
# coh_index = [3]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
3  2001.221.021038  2011.125.090631  9788  9788  21187940  71565055
# evid_n2 =  3  indexOUT2 =  6
# evidOUT2 =  72590021
# coh_index = []
# evid_n =  1  indexOUT =  2
# evidOUT =  21410540
# evid_n2 =  2  indexOUT2 =  4
# evidOUT2 =  71565055
# coh_index = [4]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
4  2004.290.072635  2011.125.090631  9685  9685  21410540  71565055
# evid_n2 =  3  indexOUT2 =  6
# evidOUT2 =  72590021
# coh_index = [5]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
5  2004.290.072635  2016.038.220752  9564  9564  21410540  72590021
# evid_n =  2  indexOUT =  4
# evidOUT =  71565055
# evid_n2 =  3  indexOUT2 =  6
# evidOUT2 =  72590021
# coh_index = [6]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
6  2011.125.090631  2016.038.220752  9746  9746  71565055  72590021
# evid_n =  3  indexOUT =  6
# evidOUT =  72590021
# seqidOUT =  1
# cre_index =  2
# evid_index =  [0, 3, 5]
# evid_index_num =  3
# evid_n =  0  indexOUT =  0
# evidOUT =  21130389
# header evid_n =  0
# evid_n2 =  1  indexOUT2 =  3
# evidOUT2 =  51194429
# coh_index = []
# evid_n2 =  2  indexOUT2 =  5
# evidOUT2 =  71588265
# coh_index = [0]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
0  2000.292.230439  2011.172.174716  9708  9708  21130389  71588265
# evid_n =  1  indexOUT =  3
# evidOUT =  51194429
# evid_n2 =  2  indexOUT2 =  5
# evidOUT2 =  71588265
# coh_index = [1]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
1  2008.012.020128  2011.172.174716  9631  9631  51194429  71588265
# evid_n =  2  indexOUT =  5
# evidOUT =  71588265

# show results: two sequences: one has 4 events and other has 2 events

%%script tcsh
cat TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.linkage_cluster.out

SEQID 21187940 SUBNUN  4 HFRC0
0 2001.221.021038 9858 9858 2001.221.021038 2004.290.072635 21187940 21410540 HFRC0
1 2001.221.021038 9788 9788 2001.221.021038 2011.125.090631 21187940 71565055 HFRC0
2 2004.290.072635 9685 9685 2004.290.072635 2011.125.090631 21410540 71565055 HFRC0
3 2004.290.072635 9564 9564 2004.290.072635 2016.038.220752 21410540 72590021 HFRC0
4 2011.125.090631 9746 9746 2011.125.090631 2016.038.220752 71565055 72590021 HFRC0
SEQID 21130389 SUBNUN  3 HFRC1
0 2000.292.230439 9708 9708 2000.292.230439 2011.172.174716 21130389 71588265 HFRC1
1 2008.012.020128 9631 9631 2008.012.020128 2011.172.174716 51194429 71588265 HFRC1
SEQNUM 2