Data analysis/TCGA

[TCGA] clinical data parsing code

jjbang 2017. 5. 15. 17:10


1. Driver script


import os

import sys

import glob

from xml.dom import minidom

import xml.etree.ElementTree as ET

import itertools

sys.path.insert(0, '/Volumes/Data1/code repository/lib')

import tcgaxml



def down(f_name):
    """Walk a tab-separated GDC manifest, one record per line.

    Args:
        f_name: Path of the manifest file to read.

    Returns:
        None. The per-record download command is currently disabled, so the
        function only reads and splits every line.
    """
    # ``with`` closes the manifest even if a line fails to parse; iterating the
    # file object replaces the deprecated ``xreadlines()`` call.
    with open(f_name, 'r') as manifest:
        for record in manifest:
            fields = record.strip().split('\t')
            # Disabled download step, kept for reference (first column is
            # passed to the client):
            # os.system('./gdc-client download  ' + fields[0])

#down('/Volumes/Data1/part2/data/TCGA/tmp/gdc_manifest.2017-05-15T04-48-25.412702.tsv')



def move(f_dir_name, m_dir_name):
    """Move the first ``*.xml`` file of every matching directory.

    Args:
        f_dir_name: Glob pattern selecting the directories to search.
        m_dir_name: Move target. When it is an existing directory the file
            keeps its base name inside it.

    Returns:
        None. Directories that contain no XML file are skipped.
    """
    # Local import: the file-level import block does not provide shutil.
    import shutil

    for case_dir in glob.glob(f_dir_name):
        xml_files = glob.glob(os.path.join(case_dir, '*.xml'))
        if not xml_files:
            # Previously an empty directory raised IndexError on ``[0]``.
            continue
        source = xml_files[0]
        target = m_dir_name
        if os.path.isdir(m_dir_name):
            # Spell out the final path so an existing file of the same name is
            # replaced, as the shell ``mv`` did.
            target = os.path.join(m_dir_name, os.path.basename(source))
        # No shell is involved, so spaces or metacharacters in either path can
        # no longer break (or inject into) the command.
        shutil.move(source, target)



# Glob pattern for the source directories handed to move() (call disabled below).
c_dir = '/Volumes/Data1/part2/data/TCGA/clinical/row/*'

# Directory holding the collected XML files; also the move() target.
m_dir = '/Volumes/Data1/part2/data/TCGA/clinical/20170515_clinical_row/'

# One-off collection step, left disabled.
#move(c_dir,m_dir)



# Path of the tab-separated table written by tcgaxml.run().
re_dir =  '/Volumes/Data1/part2/data/TCGA/clinical/20170515_par.txt'

# Tag name of the patient element passed to tcgaxml.run().
data_type = 'gbm:patient'

# Tag name of the follow-ups container passed to tcgaxml.run().
b ='gbm:follow_ups'


# Parse every file matching m_dir + '*' and write the table to re_dir.
tcgaxml.run(re_dir,m_dir + '*', data_type,b)



# The bare string below holds an R snippet (a GDCquery call for TCGA-GBM
# expression data) kept as a note; Python evaluates it as an unused string
# expression and never runs it.
"""

query.exp <- GDCquery(project = "TCGA-GBM",

data.category = "Transcriptome Profiling",

            data.type = "Gene Expression Quantification", 

            workflow.type = "HTSeq - FPKM-UQ")



"""

# Disabled manifest walk, kept for reference.
#down('/Volumes/Data1/part2/data/TCGA/tmp/gdc_manifest.2017-05-15T04-48-25.412702.tsv')


2. Library module (tcgaxml.py)

from xml.dom import minidom

import glob

import xml.etree.ElementTree as ET

import itertools



def get_ele(e_lst, title, e):
    """Collect the text of selected descendant tags of the first node in ``e``.

    Args:
        e_lst: Tag names without their prefix.
        title: Prefix prepended to every name (for example ``'clin_shared:'``).
        e: Sequence of minidom elements; only ``e[0]`` is searched.

    Returns:
        A list as long as ``e_lst``. Each slot holds the text of the first
        ``title + name`` descendant, or ``'NA'`` when that tag is missing or
        has no content.
    """
    values = []
    for name in e_lst:
        # Look the qualified tag up directly. The earlier substring test on
        # ``toxml()`` also matched unrelated tags (e.g. 'radiation_therapy'
        # inside 'additional_radiation_therapy') and then failed on ``[0]``;
        # it also re-serialised the whole node for every field.
        matches = e[0].getElementsByTagName(title + name)
        content = matches[0].firstChild if matches else None
        values.append('NA' if content is None else str(content.data))
    return values


def dir_ele(e_lst, title, up_tag):
    """Collect the text of selected descendant tags of ``up_tag[0]``.

    Args:
        e_lst: Tag names without their prefix.
        title: Prefix prepended to every name (for example ``'shared:'``).
        up_tag: Sequence of minidom elements; only the first one is searched.

    Returns:
        A list as long as ``e_lst`` holding the text of the first
        ``title + name`` descendant, or ``'NA'`` when the tag is missing or
        has no content.
    """
    values = []
    for name in e_lst:
        # Query the qualified tag instead of substring-matching the serialised
        # XML: a bare name can occur inside a longer or differently prefixed
        # tag, which used to end in IndexError on the ``[0]`` lookup.
        matches = up_tag[0].getElementsByTagName(title + name)
        content = matches[0].firstChild if matches else None
        values.append('NA' if content is None else str(content.data))
    return values


def drug_ele(e_lst, title, up_tag, j):
    """Collect the text of selected descendant tags of ``up_tag[j]``.

    Args:
        e_lst: Tag names without their prefix.
        title: Prefix prepended to every name (for example ``'rx:'``).
        up_tag: Sequence of minidom elements.
        j: Index of the element in ``up_tag`` to search.

    Returns:
        A list as long as ``e_lst`` holding the text of the first
        ``title + name`` descendant, or ``'NA'`` when the tag is missing or
        has no content.
    """
    values = []
    for name in e_lst:
        # Query the qualified tag instead of substring-matching the serialised
        # XML: a bare name can occur inside a longer tag name, which used to
        # end in IndexError on the ``[0]`` lookup.
        matches = up_tag[j].getElementsByTagName(title + name)
        content = matches[0].firstChild if matches else None
        values.append('NA' if content is None else str(content.data))
    return values


c = []


f_n = '/Volumes/data/part1/clinical data/row/nationwidechildrens.org_clinical.TCGA-3C-AAAU.xml'

e_lst = ['lost_follow_up','radiation_therapy','postoperative_rx_tx','days_to_last_followup','vital_status','days_to_death','day_of_form_completion','month_of_form_completion','year_of_form_completion']

n_lst = ['days_to_new_tumor_event_after_initial_treatment','new_neoplasm_event_type','new_neoplasm_event_occurrence_anatomic_site','new_tumor_event_additional_surgery_procedure','additional_radiation_therapy','additional_pharmaceutical_therapy']

s_lst = ['bcr_patient_barcode','gender']

c_lst = ['tumor_tissue_site','days_to_birth','race','ethnicity','age_at_initial_pathologic_diagnosis']

sh_lst = ['clinical_stage','pathologic_stage']

b_lst = ['breast_carcinoma_surgical_procedure_name']

_1_st_lst = ['days_to_last_known_alive','radiation_therapy','postoperative_rx_tx','vital_status','days_to_death','day_of_form_completion','month_of_form_completion','year_of_form_completion']

d_name = ['therapy_type','bcr_drug_barcode','drug_name','total_dose','total_dose_units','prescribed_dose','prescribed_dose_units','number_cycles','days_to_drug_therapy_start','days_to_drug_therapy_end','therapy_ongoing''measure_of_response']

d1_name = ['day_of_form_completion','month_of_form_completion','year_of_form_completion']

d_line = 'rx:'

d1_line = 'clin_shared:'


def _scan_inputs(paths):
    """Pre-scan raw XML text for follow-up versions, sequences and drug counts."""
    versions = set()
    sequences = set()
    drug_counts = []
    for path in paths:
        drugs_in_file = 0
        with open(path, 'r') as handle:
            for raw_line in handle:
                line = raw_line.strip()
                if line.startswith('<follow_up_v'):
                    # Positional parse: 2nd token is taken as the version
                    # attribute and 3rd as the sequence attribute.
                    parts = line.split(' ')
                    versions.add(parts[1].split('=')[1].replace('"', ''))
                    sequences.add(
                        parts[2].split('=')[1].strip('>').replace('"', ''))
                if line.startswith('<rx:drug>'):
                    drugs_in_file += 1
        drug_counts.append(drugs_in_file)
    # Sorted so the column order no longer depends on set iteration order.
    return sorted(versions), sorted(sequences), drug_counts


def _follow_up_cells(containers, order, empty_cell):
    """Build one ':'-joined cell per (version, sequence) pair in ``order``."""
    cells = [empty_cell] * len(order)
    if not containers:
        # No follow-ups container in this document: every cell stays empty.
        return cells
    nodes_by_version = {}
    # Start at index 0; the earlier loop began at 1 and never filled the
    # first column.
    for idx, (ver, seq) in enumerate(order):
        if ver not in nodes_by_version:
            nodes_by_version[ver] = containers[0].getElementsByTagName(
                'follow_up_v' + ver + ':follow_up')
        for node in nodes_by_version[ver]:
            if str(seq) == str(node.getAttribute('sequence')):
                # Read from the follow-up that matched; previously the first
                # follow-up of the version was always read, whatever matched.
                cells[idx] = ':'.join(
                    [ver, str(seq)]
                    + get_ele(e_lst, 'clin_shared:', [node])
                    + get_ele(n_lst, 'nte:', [node]))
    return cells


def run(re_dir, input_dir, data_type, b):
    """Flatten clinical XML files into one tab-separated table.

    Args:
        re_dir: Path of the output text file (overwritten).
        input_dir: Glob pattern selecting the XML files to parse.
        data_type: Tag name of the element holding the patient-level fields.
        b: Tag name of the element that contains the follow-up records.

    Returns:
        None. One header line plus one line per input file is written to
        ``re_dir``; progress is printed to standard output.

    Each row holds the patient-level fields, then one cell per follow-up
    (version, sequence) pair seen across all inputs, then one cell per drug
    slot (largest scanned drug count plus one). Missing values are 'NA'.
    """
    f_lst = glob.glob(input_dir)
    fup_point, f_level, d_c = _scan_inputs(f_lst)
    order = list(itertools.product(fup_point, f_level))
    f_num = len(order)
    # ``max([])`` raises, so an empty input set falls back to zero drugs.
    max_drugs = max(d_c) if d_c else 0

    # Single pre-formatted argument: prints the same on Python 2 and 3.
    print("flolow up check :  %s %s %s" % (fup_point, f_level, f_num))
    print(max_drugs)

    n_t = [ver + '_' + str(seq) + ':' + ':'.join(e_lst + n_lst)
           for ver, seq in order]
    header = ('\t'.join(s_lst + c_lst + sh_lst + b_lst + _1_st_lst + n_lst)
              + '\t' + '\t'.join(n_t)
              + '\t' + ':'.join(d_name + d1_name))

    # Placeholder cell: version + sequence + every follow-up field.
    num_dat = len(e_lst) + len(n_lst) + 2
    empty_cell = ':'.join(['NA'] * num_dat)

    # (prefix, field names) pairs for the patient-level section, in output order.
    patient_sections = [
        ('shared:', s_lst),
        ('clin_shared:', c_lst),
        ('shared_stage:', sh_lst),
        ('brca:', b_lst),
        ('clin_shared:', _1_st_lst),
        ('nte:', n_lst),
    ]

    # ``with`` guarantees the table is flushed and closed even when a file
    # fails to parse part-way through.
    with open(re_dir, 'w') as out:
        out.write(header + '\n')

        for x, f_n in enumerate(f_lst, 1):
            with open(f_n, 'r') as myfile:
                data = myfile.read().replace('\n', '')
            xmlraw = minidom.parseString(data)

            # demographics + first assessment
            sentenceList = xmlraw.getElementsByTagName(data_type)
            t_dat1 = []
            for prefix, names in patient_sections:
                t_dat1 = t_dat1 + dir_ele(names, prefix, sentenceList)

            # follow up
            t_dat = _follow_up_cells(
                xmlraw.getElementsByTagName(b), order, empty_cell)

            # drug
            drug_groups = xmlraw.getElementsByTagName('rx:drugs')
            d2 = (drug_groups[0].getElementsByTagName('rx:drug')
                  if drug_groups else [])
            # NOTE(review): the slot count comes from the line-based pre-scan;
            # a file whose parsed drug count exceeds it still raises IndexError.
            tmp_dat2 = ['NA'] * (max_drugs + 1)
            for j in range(len(d2)):
                tmp_dat2[j] = ':'.join(
                    drug_ele(d_name, d_line, d2, j)
                    + drug_ele(d1_name, d1_line, d2, j))

            out.write('\t'.join(t_dat1 + t_dat + tmp_dat2) + '\n')
            print("%s %s %s" % (x, '          ', t_dat1[0]))


if __name__ == '__main__':
    # Marker printed only when the module is executed directly.
    print("test")