In [1]:
%matplotlib inline
In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy as sp
import scipy.spatial.distance as distance
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.cluster.hierarchy import fcluster
In [3]:
print("# np version = ",np.__version__)
print("# sp version = ",sp.__version__)
print("# pd version = ",pd.__version__)
# np version =  1.16.4
# sp version =  1.3.0
# pd version =  0.24.2
In [4]:
# parameters for files 
staOUT = "TEST"
netOUT = "AL"
comOUT = "HHZ"
locOUT2 = ""

# used for cross-correlation analysis
tw = "10.24"
freqOUT = "8-24"
max_evdist = "3"

# minimum coherency (scaled by 10000; 9500 corresponds to 0.95)
min_coh = "9500"
In [5]:
# output file: cluster_fi
#ALL.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.cluster
cluster_fi = staOUT+"."+netOUT+"."+comOUT+"."+locOUT2+".allCSs."+tw+".xc."+freqOUT+"Hz."+max_evdist+".max_dist.log.tt.coh."+min_coh+".linkage_cluster.out" 
print(cluster_fi)
TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.linkage_cluster.out
In [6]:
# min_dist = 1 - coherency
# this example finds repeating event pairs if the distance is less than 0.05
min_dist = 1.0 - int(min_coh)/10000
print("# min_dist = ",min_dist)
# min_dist =  0.050000000000000044
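In [ ]:
# Optional aside (added sketch, not used below): the tiny excess over 0.05 in
# min_dist comes from floating-point arithmetic. If an exact threshold matters,
# the value can be rounded; the original run keeps min_dist as computed above.
# min_dist_exact is a new name introduced only for this sketch.
min_dist_exact = round(1.0 - int(min_coh)/10000, 4)
print("# min_dist_exact = ", min_dist_exact)
# expected: # min_dist_exact =  0.05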
In [7]:
# unique event ids; needed to build the distance matrix
evid_list_fi = "http://ncedc.org/ftp/outgoing/taira/HFRC/test.evid."+min_coh+".list" # uniq evid file from coh0.95 file
print(evid_list_fi)
http://ncedc.org/ftp/outgoing/taira/HFRC/test.evid.9500.list
In [8]:
# read event list file and check
# should be 7 ids
evid_list = pd.read_csv(evid_list_fi,   
                       sep=" ",names=["evid"],header=None)
#print(evid_list.head())
print(evid_list)
       evid
0  21130389
1  21187940
2  21410540
3  51194429
4  71565055
5  71588265
6  72590021
In [9]:
# how many events?
n = len(evid_list['evid'])
print(n)
7
In [10]:
# make distance matrix dMatrix
#dMatrix = np.zeros([n, n])
dMatrix = np.ones([n,n]) # initialize all pairs with dist = 1
np.fill_diagonal(dMatrix, 0)
print(dMatrix)
[[0. 1. 1. 1. 1. 1. 1.]
 [1. 0. 1. 1. 1. 1. 1.]
 [1. 1. 0. 1. 1. 1. 1.]
 [1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1.]
 [1. 1. 1. 1. 1. 0. 1.]
 [1. 1. 1. 1. 1. 1. 0.]]
In [11]:
# coh_fi: coherency file
coh_fi = "http://ncedc.org/ftp/outgoing/taira/HFRC/"+staOUT+"."+netOUT+"."+comOUT+"."+locOUT2+".allCSs."+tw+".xc."+freqOUT+"Hz."+max_evdist+".max_dist.log.tt.coh."+min_coh+".catalog"
print(coh_fi)
http://ncedc.org/ftp/outgoing/taira/HFRC/TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.catalog
In [12]:
# read coherency file
#1984.069.223101 1995.236.113508 9652 9652 9715 30081294
#1984.069.223101 2001.100.100318 9714 9714 9715 21157672
#1984.070.085241 1994.205.004902 9805 9805 9788 30053446
# currently coh1 = coh2 
coh = pd.read_csv(coh_fi,   
                       sep=" ",names=["ms_name1", "ms_name2", "coh1", "coh2", "evid1", "evid2"],header=None)
print(coh)
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
0  2000.292.230439  2011.172.174716  9708  9708  21130389  71588265
1  2008.012.020128  2011.172.174716  9631  9631  51194429  71588265
2  2001.221.021038  2004.290.072635  9858  9858  21187940  21410540
3  2001.221.021038  2011.125.090631  9788  9788  21187940  71565055
4  2004.290.072635  2011.125.090631  9685  9685  21410540  71565055
5  2004.290.072635  2016.038.220752  9564  9564  21410540  72590021
6  2011.125.090631  2016.038.220752  9746  9746  71565055  72590021
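In [ ]:
# Optional consistency check (added sketch; assumes the cells above have run):
# every event id in the coherency catalog should also appear in evid_list,
# otherwise the distance-matrix fill below would silently skip that pair.
# known_evids and missing_evids are new names introduced for this sketch.
known_evids = set(evid_list['evid'])
missing_evids = (set(coh['evid1']) | set(coh['evid2'])) - known_evids
print("# evids in coh but not in evid_list: ", sorted(missing_evids))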
In [13]:
# how many coherency measurements do we have?
num = len(coh['evid1'])
print(num)
7
In [14]:
# "insert" distance = (1 - coherency) values at the appropriate locations in dMatrix.
# if we do not have a coherency value for a pair of events, the distance stays 1, i.e., coherency = 0.0
for i in range(num):
    evid1 = coh['evid1'][i]
    evid2 = coh['evid2'][i]
    dcc = 1.0 - (coh['coh1'][i]/10000.0)
    #print(evid1, evid2)
    evid1_index = np.where(evid_list['evid'] == evid1)[0].tolist()
    evid2_index = np.where(evid_list['evid'] == evid2)[0].tolist()
    #print(evid1_index[0], evid2_index[0], dcc)
    dMatrix[evid1_index, evid2_index] = dcc
    dMatrix[evid2_index, evid1_index] = dcc
In [15]:
# check where the distance is less than 1, i.e., the pairs for which we have coherency values
#np.where(dMatrix < 1 )[0].tolist()
print(list(zip(*np.where(dMatrix < 1))))
[(0, 0), (0, 5), (1, 1), (1, 2), (1, 4), (2, 1), (2, 2), (2, 4), (2, 6), (3, 3), (3, 5), (4, 1), (4, 2), (4, 4), (4, 6), (5, 0), (5, 3), (5, 5), (6, 2), (6, 4), (6, 6)]
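In [ ]:
# Equivalent fill with a dict lookup (added sketch; dMatrix above is already
# populated, so this is only a cross-check). Mapping each evid to its row index
# once avoids the repeated np.where scans and keeps the matrix symmetric by
# construction. evid_to_index, dMatrix2, and dcc2 are new names for this sketch.
evid_to_index = {evid: idx for idx, evid in enumerate(evid_list['evid'])}
dMatrix2 = np.ones([n, n])
np.fill_diagonal(dMatrix2, 0)
for _, row in coh.iterrows():
    i1 = evid_to_index[row['evid1']]
    i2 = evid_to_index[row['evid2']]
    dcc2 = 1.0 - row['coh1']/10000.0
    dMatrix2[i1, i2] = dMatrix2[i2, i1] = dcc2
print("# same as dMatrix? ", np.allclose(dMatrix, dMatrix2))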
In [16]:
#  checking
dMatrix
Out[16]:
array([[0.    , 1.    , 1.    , 1.    , 1.    , 0.0292, 1.    ],
       [1.    , 0.    , 0.0142, 1.    , 0.0212, 1.    , 1.    ],
       [1.    , 0.0142, 0.    , 1.    , 0.0315, 1.    , 0.0436],
       [1.    , 1.    , 1.    , 0.    , 1.    , 0.0369, 1.    ],
       [1.    , 0.0212, 0.0315, 1.    , 0.    , 1.    , 0.0254],
       [0.0292, 1.    , 1.    , 0.0369, 1.    , 0.    , 1.    ],
       [1.    , 1.    , 0.0436, 1.    , 0.0254, 1.    , 0.    ]])
In [17]:
# make dArray (condensed distance vector); this is needed for linkage
dArray = distance.squareform(dMatrix)
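In [ ]:
# Optional check (added sketch): squareform expects a symmetric matrix with a
# zero diagonal and returns the condensed form of length n*(n-1)/2, which is
# the input linkage works on.
print("# symmetric? ", np.allclose(dMatrix, dMatrix.T))
print("# len(dArray) = ", len(dArray), " expected = ", n*(n-1)//2)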
In [18]:
# do clustering. we use 'single' linkage, which allows chain reactions; all linked pairs become members of the same sequence
#z = linkage(dArray, method = 'average')
#z = linkage(dArray, method = 'centroid')
z = linkage(dArray, method = 'single') # allow chain reaction
In [19]:
# checking z matrix
z
Out[19]:
array([[ 1.    ,  2.    ,  0.0142,  2.    ],
       [ 4.    ,  7.    ,  0.0212,  3.    ],
       [ 6.    ,  8.    ,  0.0254,  4.    ],
       [ 0.    ,  5.    ,  0.0292,  2.    ],
       [ 3.    , 10.    ,  0.0369,  3.    ],
       [ 9.    , 11.    ,  1.    ,  7.    ]])
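In [ ]:
# How to read z (added sketch; standard scipy linkage layout): row i merges
# clusters z[i,0] and z[i,1] at distance z[i,2] into a new cluster with id n+i
# that contains z[i,3] original events. Ids 0..n-1 are the original events.
for i, (c1, c2, dist, size) in enumerate(z):
    print("# merge %d: %d + %d at dist %.4f -> cluster %d (%d events)"
          % (i, int(c1), int(c2), dist, n + i, int(size)))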
In [20]:
# another check
z[0]
Out[20]:
array([1.    , 2.    , 0.0142, 2.    ])
In [21]:
# make dendrogram; 
# should show two sequences
dendrogram(z)
Out[21]:
{'icoord': [[25.0, 25.0, 35.0, 35.0],
  [15.0, 15.0, 30.0, 30.0],
  [5.0, 5.0, 22.5, 22.5],
  [55.0, 55.0, 65.0, 65.0],
  [45.0, 45.0, 60.0, 60.0],
  [13.75, 13.75, 52.5, 52.5]],
 'dcoord': [[0.0, 0.01419999999999999, 0.01419999999999999, 0.0],
  [0.0, 0.021199999999999997, 0.021199999999999997, 0.01419999999999999],
  [0.0, 0.025399999999999978, 0.025399999999999978, 0.021199999999999997],
  [0.0, 0.029200000000000004, 0.029200000000000004, 0.0],
  [0.0, 0.036900000000000044, 0.036900000000000044, 0.029200000000000004],
  [0.025399999999999978, 1.0, 1.0, 0.036900000000000044]],
 'ivl': ['6', '4', '1', '2', '3', '0', '5'],
 'leaves': [6, 4, 1, 2, 3, 0, 5],
 'color_list': ['g', 'g', 'g', 'r', 'r', 'b']}
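In [ ]:
# Optional, more readable dendrogram (added sketch): label the leaves with the
# NCSS event ids and color branches by cutting at min_dist; 'labels' and
# 'color_threshold' are standard dendrogram options.
plt.figure(figsize=(8, 4))
dendrogram(z, labels=evid_list['evid'].astype(str).tolist(),
           color_threshold=min_dist)
plt.axhline(min_dist, linestyle='--', color='gray')
plt.ylabel('distance (1 - coherency)')
plt.show()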
In [22]:
# make sequences with min_dist threshold 
#cre_id = fcluster(z, t=0.05, criterion='distance') # a distance of 0.05 corresponds to coherency 0.95
cre_id = fcluster(z, t=min_dist, criterion='distance') 
In [23]:
# check the sequence id for each event
# events 0, 3, 5 should have seqid = 2
# events 1, 2, 4, 6 should have seqid = 1
print(cre_id)
[2 1 1 2 1 2 1]
In [24]:
# how many sequences?
max_id = cre_id.max()
print(max_id)
2
In [25]:
# how many events are in each sequence? if 1 or 0, something is wrong
# "len" gives the number of events in the sequence
for i in range(1, max_id+1):
    a=np.where(cre_id == i)
    #print(a)
    lenOUT=len(a[0])
    print("i = ", i, " lenOUT = ",lenOUT)
    if lenOUT <= 1:
        print("something wrong!")
        print("i = ", i, " lenOUT = ",lenOUT)
i =  1  lenOUT =  4
i =  2  lenOUT =  3
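In [ ]:
# Same membership summary with pandas (added sketch; uses cre_id from fcluster
# above): attach the sequence id to each event and group by it. seq_df is a new
# name introduced for this sketch.
seq_df = pd.DataFrame({'evid': evid_list['evid'], 'seqid': cre_id})
print(seq_df.groupby('seqid')['evid'].apply(list))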
In [26]:
# which evid_index values have sequence_id = 1?
cre_id_in = 1
print(np.where(cre_id == cre_id_in)[0])
# expected: [1 2 4 6]
print(np.where(cre_id == cre_id_in)[0].tolist())
cre_index  = np.where(cre_id == cre_id_in)[0].tolist()
[1 2 4 6]
[1, 2, 4, 6]
In [27]:
# check the NCSS event ids based on cre_index
for indexOUT in cre_index:
    print(indexOUT)
    print(evid_list['evid'][indexOUT])
1
21187940
2
21410540
4
71565055
6
72590021
In [28]:
# prefix for this project
cre_prefix = "HFRC"
In [29]:
# make the cluster file; expected format:
#SEQID 9715 SUBNUM X HFRC0
#0 1984.069.223101 9652 9652 1984.069.223101 1995.236.113508 9715 30081294 HFRC0
#1 1984.069.223101 9714 9714 1984.069.223101 2001.100.100318 9715 21157672 HFRC0
with open(cluster_fi , 'w') as f:
    for cre_index in range(1, max_id+1): # id starts from 1.
    #for cre_index in range(1, 10):
        pair_n = 0
        seqidOUT = cre_index - 1 # id starts from 1
        print("# seqidOUT = ",seqidOUT)
        print("# cre_index = ",cre_index)
        evid_index  = np.where(cre_id == cre_index)[0].tolist()
        print("# evid_index = ",evid_index)
        evid_index_num = len(evid_index)
        print("# evid_index_num = ",evid_index_num)

        for evid_n, indexOUT in enumerate(evid_index):
            print("# evid_n = ",evid_n," indexOUT = ",indexOUT) # index
            evidOUT = (evid_list['evid'][indexOUT])
            print("# evidOUT = ",evidOUT)

            if evid_n== 0: # header
                print("# header evid_n = ",evid_n)
                f.write('SEQID '+str(evidOUT)+" SUBNUN  "+str(evid_index_num)+" "+cre_prefix+str(seqidOUT)+ "\n")
            
            # loop for evid2. 
            #for evid_n2, indexOUT2 in enumerate(evid_index, evid_n+1):
            for evid_n2 in range( evid_n + 1 ,  len(evid_index) ):
                indexOUT2 = evid_index[evid_n2]
                print("# evid_n2 = ",evid_n2," indexOUT2 = ",indexOUT2) # index
                evidOUT2 = (evid_list['evid'][indexOUT2])
                print("# evidOUT2 = ",evidOUT2)
                coh_index = np.where( (coh['evid1'] == evidOUT)  & (coh['evid2'] == evidOUT2) |  (coh['evid2'] == evidOUT)  & (coh['evid1'] == evidOUT2) )[0].tolist()
                print("# coh_index =", coh_index)
                
                for coh_n, coh_indexOUT in enumerate(coh_index):
                    #print(coh_indexOUT)
                    #print(coh['evid1'][coh_indexOUT])
                    print(coh[coh_indexOUT:coh_indexOUT+1])
                    coh_select = coh[coh_indexOUT:coh_indexOUT+1]
                    #0 1984.069.223101 9652 9652 1984.069.223101 1995.236.113508 9715 30081294 HFRC0
                    #print(pair_n," ", (coh['ms_name1'][coh_indexOUT])," ", str(coh['coh1'][coh_indexOUT]), " ", str(coh['coh2'][coh_indexOUT]),
                    #      " ", (coh['ms_name1'][coh_indexOUT]), " ", (coh['ms_name2'][coh_indexOUT]), 
                    #      " ", (coh['evid1'][coh_indexOUT]), " ", (coh['evid2'][coh_indexOUT]), " ", cre_prefix+str(seqidOUT) )
                            
                    #f.write(str(pair_n)+" "+(coh['ms_name1'][coh_indexOUT]))
                    f.write(str(pair_n)+" "+(coh['ms_name1'][coh_indexOUT])+" "+str(coh['coh1'][coh_indexOUT])+" "+str(coh['coh2'][coh_indexOUT])   +" "+(coh['ms_name1'][coh_indexOUT])  +" "+(coh['ms_name2'][coh_indexOUT])+" "+str(coh['evid1'][coh_indexOUT]) +" "+str(coh['evid2'][coh_indexOUT])    +" "+cre_prefix+str(seqidOUT)  + "\n")
                            
                    pair_n = pair_n + 1

                    
        
#SEQNUM 1442
 
    f.write("SEQNUM "+ str(max_id) + "\n")
# seqidOUT =  0
# cre_index =  1
# evid_index =  [1, 2, 4, 6]
# evid_index_num =  4
# evid_n =  0  indexOUT =  1
# evidOUT =  21187940
# header evid_n =  0
# evid_n2 =  1  indexOUT2 =  2
# evidOUT2 =  21410540
# coh_index = [2]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
2  2001.221.021038  2004.290.072635  9858  9858  21187940  21410540
# evid_n2 =  2  indexOUT2 =  4
# evidOUT2 =  71565055
# coh_index = [3]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
3  2001.221.021038  2011.125.090631  9788  9788  21187940  71565055
# evid_n2 =  3  indexOUT2 =  6
# evidOUT2 =  72590021
# coh_index = []
# evid_n =  1  indexOUT =  2
# evidOUT =  21410540
# evid_n2 =  2  indexOUT2 =  4
# evidOUT2 =  71565055
# coh_index = [4]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
4  2004.290.072635  2011.125.090631  9685  9685  21410540  71565055
# evid_n2 =  3  indexOUT2 =  6
# evidOUT2 =  72590021
# coh_index = [5]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
5  2004.290.072635  2016.038.220752  9564  9564  21410540  72590021
# evid_n =  2  indexOUT =  4
# evidOUT =  71565055
# evid_n2 =  3  indexOUT2 =  6
# evidOUT2 =  72590021
# coh_index = [6]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
6  2011.125.090631  2016.038.220752  9746  9746  71565055  72590021
# evid_n =  3  indexOUT =  6
# evidOUT =  72590021
# seqidOUT =  1
# cre_index =  2
# evid_index =  [0, 3, 5]
# evid_index_num =  3
# evid_n =  0  indexOUT =  0
# evidOUT =  21130389
# header evid_n =  0
# evid_n2 =  1  indexOUT2 =  3
# evidOUT2 =  51194429
# coh_index = []
# evid_n2 =  2  indexOUT2 =  5
# evidOUT2 =  71588265
# coh_index = [0]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
0  2000.292.230439  2011.172.174716  9708  9708  21130389  71588265
# evid_n =  1  indexOUT =  3
# evidOUT =  51194429
# evid_n2 =  2  indexOUT2 =  5
# evidOUT2 =  71588265
# coh_index = [1]
          ms_name1         ms_name2  coh1  coh2     evid1     evid2
1  2008.012.020128  2011.172.174716  9631  9631  51194429  71588265
# evid_n =  2  indexOUT =  5
# evidOUT =  71588265
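In [ ]:
# Optional read-back check (added sketch): parse the cluster file just written
# and count the SEQID headers and pair lines; these should match max_id and
# the pair counts printed above. fin, out_lines, n_seq, and n_pair are new
# names introduced for this sketch.
with open(cluster_fi) as fin:
    out_lines = fin.read().splitlines()
n_seq = sum(1 for l in out_lines if l.startswith('SEQID'))
n_pair = sum(1 for l in out_lines if l and not l.startswith(('SEQID', 'SEQNUM')))
print("# SEQID headers = ", n_seq, " pair lines = ", n_pair)
print("# last line = ", out_lines[-1])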
In [30]:
# show results: two sequences, one with 4 events and the other with 3 events
In [31]:
%%script tcsh
cat TEST.AL.HHZ..allCSs.10.24.xc.8-24Hz.3.max_dist.log.tt.coh.9500.linkage_cluster.out
SEQID 21187940 SUBNUN  4 HFRC0
0 2001.221.021038 9858 9858 2001.221.021038 2004.290.072635 21187940 21410540 HFRC0
1 2001.221.021038 9788 9788 2001.221.021038 2011.125.090631 21187940 71565055 HFRC0
2 2004.290.072635 9685 9685 2004.290.072635 2011.125.090631 21410540 71565055 HFRC0
3 2004.290.072635 9564 9564 2004.290.072635 2016.038.220752 21410540 72590021 HFRC0
4 2011.125.090631 9746 9746 2011.125.090631 2016.038.220752 71565055 72590021 HFRC0
SEQID 21130389 SUBNUN  3 HFRC1
0 2000.292.230439 9708 9708 2000.292.230439 2011.172.174716 21130389 71588265 HFRC1
1 2008.012.020128 9631 9631 2008.012.020128 2011.172.174716 51194429 71588265 HFRC1
SEQNUM 2