[TCGA] Clinical data parsing code
1. Driver script (calls tcgaxml.run on the collected clinical XML files)
import os
import sys
import glob
from xml.dom import minidom
import xml.etree.ElementTree as ET
import itertools
sys.path.insert(0, '/Volumes/Data1/code repository/lib')
import tcgaxml
def down(f_name):
    """Walk a tab-separated GDC manifest file line by line.

    The actual download command is commented out, so this function
    currently only reads the file and splits each line.

    Args:
        f_name: Path of the manifest file to read.

    Returns:
        None.
    """
    # ``with`` guarantees the handle is closed; iterating the file object
    # directly replaces ``xreadlines()``, which no longer exists on Python 3.
    with open(f_name, 'r') as manifest:
        for f_line in manifest:
            # First column is the identifier the disabled command would use.
            f_sp = f_line.strip().split('\t')
            #os.system('./gdc-client download '+ f_sp[0])
#down('/Volumes/Data1/part2/data/TCGA/tmp/gdc_manifest.2017-05-15T04-48-25.412702.tsv')
def move(f_dir_name, m_dir_name):
    """Move one XML file out of every folder matching a glob pattern.

    Args:
        f_dir_name: Glob pattern selecting the source folders.
        m_dir_name: Destination. When it is an existing directory the file
            is moved into it under its own name; otherwise the file is
            moved to that exact path (same convention as ``mv``).

    Returns:
        None. Folders that contain no ``*.xml`` file are skipped.
    """
    # Local import so this function does not depend on the file's import block.
    import shutil

    for fd_line in glob.glob(f_dir_name):
        fd1 = glob.glob(fd_line + '/*.xml')
        if not fd1:
            # Nothing to move; the original indexed fd1[0] and crashed here.
            continue
        src = fd1[0]
        if os.path.isdir(m_dir_name):
            dst = os.path.join(m_dir_name, os.path.basename(src))
        else:
            dst = m_dir_name
        # No shell is involved, so paths containing spaces or shell
        # metacharacters are handled correctly.
        shutil.move(src, dst)
# Glob pattern for the source folders; only used by the disabled move() call below.
c_dir = '/Volumes/Data1/part2/data/TCGA/clinical/row/*'
# Destination folder of the disabled move() call; its contents are the input of tcgaxml.run.
m_dir = '/Volumes/Data1/part2/data/TCGA/clinical/20170515_clinical_row/'
# One-off relocation step, currently disabled.
#move(c_dir,m_dir)
# Path passed to tcgaxml.run as its first argument (re_dir).
re_dir = '/Volumes/Data1/part2/data/TCGA/clinical/20170515_par.txt'
# Tag names passed to tcgaxml.run as its data_type and b arguments.
data_type = 'gbm:patient'
b ='gbm:follow_ups'
# NOTE: executed at import time; the input pattern is every path under m_dir.
tcgaxml.run(re_dir,m_dir + '*', data_type,b)
"""
query.exp <- GDCquery(project = "TCGA-GBM",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - FPKM-UQ")
"""
#down('/Volumes/Data1/part2/data/TCGA/tmp/gdc_manifest.2017-05-15T04-48-25.412702.tsv')
2. Library module (tcgaxml.py)
from xml.dom import minidom
import glob
import xml.etree.ElementTree as ET
import itertools
def get_ele(e_lst, title, e):
    """Read the text of selected tags below the first node of ``e``.

    Args:
        e_lst: Tag names without prefix.
        title: Prefix prepended to every name to build the qualified tag
            name that is looked up (e.g. ``'clin_shared:'``).
        e: Sequence of DOM nodes; only ``e[0]`` is searched.

    Returns:
        A list aligned with ``e_lst``. Each entry is the ``str`` of the
        first matching tag's first child data, or ``'NA'`` when the tag is
        absent, has no child, or ``e`` is empty.
    """
    values = ['NA'] * len(e_lst)
    if len(e) == 0:
        return values
    node = e[0]
    for i, name in enumerate(e_lst):
        # Query the qualified tag directly. The previous substring test on
        # the serialized XML also matched longer tag names and plain text
        # (e.g. 'radiation_therapy' inside 'additional_radiation_therapy'),
        # after which indexing the empty result raised IndexError.
        matches = node.getElementsByTagName(title + name)
        if len(matches) == 0:
            continue
        first = matches[0].firstChild
        if first is not None:
            values[i] = str(first.data)
    return values
def dir_ele(e_lst, title, up_tag):
    """Read the text of selected tags below the first node of ``up_tag``.

    Args:
        e_lst: Tag names without prefix.
        title: Prefix prepended to every name to build the qualified tag
            name that is looked up (e.g. ``'shared:'``).
        up_tag: Sequence of DOM nodes; only ``up_tag[0]`` is searched.

    Returns:
        A list aligned with ``e_lst`` holding the ``str`` of each tag's
        first child data, or ``'NA'`` when the tag is absent, has no child,
        or ``up_tag`` is empty.
    """
    if len(up_tag) == 0:
        return ['NA'] * len(e_lst)
    parent = up_tag[0]
    result = []
    for name in e_lst:
        # Direct tag lookup instead of a substring test on the serialized
        # XML: the substring could match another tag's name or text and
        # then the [0] index on an empty result raised IndexError.
        found = parent.getElementsByTagName(title + name)
        child = found[0].firstChild if len(found) > 0 else None
        result.append('NA' if child is None else str(child.data))
    return result
def drug_ele(e_lst, title, up_tag, j):
    """Read the text of selected tags below ``up_tag[j]``.

    Args:
        e_lst: Tag names without prefix.
        title: Prefix prepended to every name to build the qualified tag
            name that is looked up (e.g. ``'rx:'``).
        up_tag: Sequence of DOM nodes.
        j: Index of the node in ``up_tag`` to search.

    Returns:
        A list aligned with ``e_lst`` holding the ``str`` of each tag's
        first child data, or ``'NA'`` when the tag is absent or has no
        child.

    Raises:
        IndexError: If ``j`` is not a valid index of ``up_tag``.
    """
    entry = up_tag[j]
    tmp = ['NA'] * len(e_lst)
    for i, name in enumerate(e_lst):
        # Ask the DOM for the qualified tag itself; the earlier substring
        # check on toxml() matched unrelated tags/text and then crashed
        # with IndexError when the real tag was missing.
        hits = entry.getElementsByTagName(title + name)
        if len(hits) > 0 and hits[0].firstChild is not None:
            tmp[i] = str(hits[0].firstChild.data)
    return tmp
c = []
f_n = '/Volumes/data/part1/clinical data/row/nationwidechildrens.org_clinical.TCGA-3C-AAAU.xml'
e_lst = ['lost_follow_up','radiation_therapy','postoperative_rx_tx','days_to_last_followup','vital_status','days_to_death','day_of_form_completion','month_of_form_completion','year_of_form_completion']
n_lst = ['days_to_new_tumor_event_after_initial_treatment','new_neoplasm_event_type','new_neoplasm_event_occurrence_anatomic_site','new_tumor_event_additional_surgery_procedure','additional_radiation_therapy','additional_pharmaceutical_therapy']
s_lst = ['bcr_patient_barcode','gender']
c_lst = ['tumor_tissue_site','days_to_birth','race','ethnicity','age_at_initial_pathologic_diagnosis']
sh_lst = ['clinical_stage','pathologic_stage']
b_lst = ['breast_carcinoma_surgical_procedure_name']
_1_st_lst = ['days_to_last_known_alive','radiation_therapy','postoperative_rx_tx','vital_status','days_to_death','day_of_form_completion','month_of_form_completion','year_of_form_completion']
d_name = ['therapy_type','bcr_drug_barcode','drug_name','total_dose','total_dose_units','prescribed_dose','prescribed_dose_units','number_cycles','days_to_drug_therapy_start','days_to_drug_therapy_end','therapy_ongoing''measure_of_response']
d1_name = ['day_of_form_completion','month_of_form_completion','year_of_form_completion']
d_line = 'rx:'
d1_line = 'clin_shared:'
def _scan_inputs(paths):
    """Pre-scan the input files line by line for follow-up and drug tags.

    Returns three lists: every follow-up version seen, every follow-up
    sequence seen, and the number of '<rx:drug>' lines per file.
    """
    versions = []
    sequences = []
    drug_counts = []
    for path in paths:
        n_drugs = 0
        with open(path, 'r') as handle:
            for raw in handle:
                line = raw.strip()
                if line.startswith('<follow_up_v'):
                    # Second and third space-separated tokens are read as
                    # key="value" pairs (version, then sequence).
                    parts = line.split(' ')
                    versions.append(parts[1].split('=')[1].replace('"', ''))
                    sequences.append(parts[2].split('=')[1].strip('>').replace('"', ''))
                if line.startswith('<rx:drug>'):
                    n_drugs += 1
        drug_counts.append(n_drugs)
    return versions, sequences, drug_counts


def _follow_up_cells(xmlraw, b, order, placeholder):
    """Build one ':'-joined cell per (version, sequence) pair of ``order``."""
    cells = [placeholder] * len(order)
    pa = xmlraw.getElementsByTagName(b)
    if len(pa) == 0:
        # No follow-up container in this document: keep every placeholder.
        return cells
    # Start at index 0; the old loop began at 1 and never filled slot 0.
    for i, (ver, seq) in enumerate(order):
        for node in pa[0].getElementsByTagName('follow_up_v' + ver + ':follow_up'):
            if str(seq) != str(node.getAttribute('sequence')):
                continue
            # Read from the node whose sequence matched. Previously the
            # whole list was passed on and only its first node was read.
            cells[i] = ':'.join(
                [ver, str(seq)]
                + get_ele(e_lst, 'clin_shared:', [node])
                + get_ele(n_lst, 'nte:', [node])
            )
    return cells


def _drug_cells(xmlraw, slots):
    """Build ``slots`` drug cells, 'NA' where the document has no drug."""
    cells = ['NA'] * slots
    d1 = xmlraw.getElementsByTagName('rx:drugs')
    if len(d1) == 0:
        return cells
    d2 = d1[0].getElementsByTagName('rx:drug')
    for j in range(len(d2)):
        # NOTE: raises IndexError if the DOM holds more rx:drug elements
        # than the line-based pre-scan counted for any file.
        cells[j] = ':'.join(drug_ele(d_name, d_line, d2, j) + drug_ele(d1_name, d1_line, d2, j))
    return cells


def run(re_dir, input_dir, data_type, b):
    """Flatten a set of clinical XML files into one tab-separated table.

    Args:
        re_dir: Path of the output text file (overwritten).
        input_dir: Glob pattern selecting the XML files to read.
        data_type: Tag name of the element holding the patient-level
            fields (e.g. ``'gbm:patient'``).
        b: Tag name of the element holding the follow-ups
            (e.g. ``'gbm:follow_ups'``).

    Output layout: a header line, then one line per input file made of the
    patient-level fields, one ':'-joined cell per follow-up
    (version, sequence) pair found across all inputs, and one ':'-joined
    cell per drug slot. Missing values are written as ``'NA'``.

    Returns:
        None. Progress is printed to standard output.
    """
    f_lst = glob.glob(input_dir)
    f_v, seq_v, d_c = _scan_inputs(f_lst)

    # Sorted so the column order is reproducible between runs.
    fup_point = sorted(set(f_v))
    f_level = sorted(set(seq_v))
    order = list(itertools.product(fup_point, f_level))
    f_num = len(order)
    # max() of an empty list raises; no input files means no drug columns.
    max_drugs = max(d_c) if d_c else 0

    # Single-argument print() produces the same text on Python 2 and 3.
    print('%s %s %s %s' % ("flolow up check : ", fup_point, f_level, f_num))
    print(max_drugs)

    fu_fields = e_lst + n_lst
    n_t = [ver + '_' + str(seq) + ':' + ':'.join(fu_fields) for ver, seq in order]
    header = '\t'.join(
        s_lst + c_lst + sh_lst + b_lst + _1_st_lst + n_lst
        + n_t
        + [':'.join(d_name + d1_name)]
    )

    # Cell used for a (version, sequence) pair a patient does not have:
    # version + sequence + every follow-up field, all 'NA'.
    placeholder = ':'.join(['NA'] * (len(fu_fields) + 2))
    patient_groups = list(zip(
        ['shared:', 'clin_shared:', 'shared_stage:', 'brca:', 'clin_shared:', 'nte:'],
        [s_lst, c_lst, sh_lst, b_lst, _1_st_lst, n_lst],
    ))

    with open(re_dir, 'w') as out:
        out.write(header + '\n')
        for x, f_n in enumerate(f_lst, 1):
            with open(f_n, 'r') as myfile:
                data = myfile.read().replace('\n', '')
            xmlraw = minidom.parseString(data)

            # Patient-level fields.
            sentenceList = xmlraw.getElementsByTagName(data_type)
            t_dat1 = []
            for l_line, l_name in patient_groups:
                t_dat1 += dir_ele(l_name, l_line, sentenceList)

            t_dat = _follow_up_cells(xmlraw, b, order, placeholder)
            tmp_dat2 = _drug_cells(xmlraw, max_drugs + 1)

            out.write('\t'.join(t_dat1 + t_dat + tmp_dat2) + '\n')
            print('%s   %s' % (x, t_dat1[0]))
if __name__ == '__main__':
    # Smoke message when the module is executed directly. The parenthesized
    # single-argument form prints the same text on Python 2 and Python 3.
    print("test")