Commit 3a7ff6a9 authored by yugao@uic.edu

Updated to an even faster splitter; binary data output is not yet completed.

parent 4ebff29b
......@@ -3,7 +3,7 @@ import mmap
import os
import numpy as np
import time
import base64
import pybase64
import zlib
from xml.etree import cElementTree as ET
......@@ -20,29 +20,45 @@ accession_dict = {"MS:1000519": "32i",
"MS:1000514": "mass"}
def _format_marker(fmt):
    """Return the ``b'$encoding$compression$type$'`` marker for one array format."""
    fields = [fmt['data_encoding'], fmt['data_compression'], fmt['data_type']]
    return ("$%s$" % '$'.join(fields)).encode('utf-8')


def find_string(file_name, match_tag_start, match_tag_end, data_format, spec_no):
    """Locate tagged payloads in *file_name* and write a stripped copy of it.

    The input is memory-mapped and scanned for ``match_tag_start`` ...
    ``match_tag_end`` pairs.  A copy of the file is written next to the
    input with the extension replaced by ``.smzml``; in that copy every
    payload found is replaced by a short format marker
    (``$encoding$compression$type$``).  Payloads alternate between the
    ``'mass'`` and ``'intensity'`` formats, starting with ``'mass'``.

    Args:
        file_name: Path of the input file.
        match_tag_start: Opening tag that precedes each payload.
        match_tag_end: Closing tag that follows each payload.
        data_format: Dict with ``'mass'`` and ``'intensity'`` entries, each a
            dict holding ``'data_encoding'``, ``'data_compression'`` and
            ``'data_type'`` strings.
        spec_no: Number of spectra; at most ``spec_no * 2`` payloads are
            processed, and anything after them is copied unchanged.

    Returns:
        A list of ``(start, end)`` byte offsets into the input file, one per
        payload, delimiting the bytes between the opening and closing tag.

    Raises:
        ValueError: If the output path would be the input path, or if an
            opening tag has no matching closing tag.
    """
    start_time = time.time()

    # Derive the output name from the real extension.  A plain
    # str.replace('.mzML', ...) is a no-op for e.g. '.mzml', which would make
    # the 'wb' open below truncate the input file itself.
    out_name = os.path.splitext(file_name)[0] + '.smzml'
    same_path = (os.path.normcase(os.path.abspath(out_name))
                 == os.path.normcase(os.path.abspath(file_name)))
    if same_path:
        raise ValueError("output file would overwrite input file: %s" % file_name)

    start_tag = match_tag_start.encode('utf-8')
    end_tag = match_tag_end.encode('utf-8')
    # Offsets are byte offsets, so measure the encoded tags, not the str.
    len_start_tag = len(start_tag)
    len_end_tag = len(end_tag)

    # Index 0 is used for even-numbered payloads, index 1 for odd ones.
    markers = (_format_marker(data_format['mass']),
               _format_marker(data_format['intensity']))

    data_positions = []
    limit = spec_no * 2

    with open(file_name, 'rb') as f, open(out_name, 'wb') as fo:
        if os.fstat(f.fileno()).st_size == 0:
            # Nothing to scan; leave an empty output file behind.
            return data_positions

        # Memory-map the file; size 0 means the whole file.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            copied_to = 0  # input offset up to which output has been written
            while len(data_positions) < limit:
                start = m.find(start_tag, copied_to)
                if start == -1:
                    # Fewer payloads than expected: stop instead of doing
                    # offset arithmetic with -1.
                    break
                data_start = start + len_start_tag
                end = m.find(end_tag, data_start)
                if end == -1:
                    raise ValueError(
                        "no %r found after %r at byte offset %d"
                        % (match_tag_end, match_tag_start, start))

                # Copy everything up to and including the opening tag, then
                # substitute the payload with its format marker.
                fo.write(m[copied_to:data_start])
                fo.write(markers[len(data_positions) % 2])
                # For the '</binary>' tag passed by the caller in this file
                # this is identical to the previously hard-coded literal.
                fo.write(end_tag)

                data_positions.append((data_start, end))
                copied_to = end + len_end_tag

            print("total time used to position all b64 data:", time.time() - start_time)
            # Copy whatever follows the last processed payload unchanged.
            fo.write(m[copied_to:])

    return data_positions
def base64_decoder(base64_data:bytes, number_fmt, compress_method, array_type):
if base64_data is not None:
num_type = np_dtype_numtype[number_fmt[-1]]
decode_base64 = base64.decodebytes(base64_data)
decode_base64 = pybase64._pybase64.b64decode_as_bytearray(base64_data)
if compress_method == 'zlib':
decode_base64 = zlib.decompress(decode_base64)
data = np.frombuffer(decode_base64, dtype=num_type)
......@@ -63,13 +79,14 @@ def mzml_splitter(mzml_file: str):
# Open file pointers
mass_binary_out_fp = open(mass_binary_file, 'wb')
int_binary_out_fp = open(int_binary_file, 'wb')
mzml_out_fp = open(mzml_out, 'w', newline='\n', encoding='utf-8')
#mzml_out_fp = open(mzml_out, 'w', newline='\n', encoding='utf-8')
data_format={}
# Iterate mzML file and get all data into data_position[]
current_encoding = '32f'
current_compress = 'no compression'
spec_no=0
for event, elem in iter(ET.iterparse(mzml_file, events=('start',))):
if elem.tag.endswith('}cvParam'):
if elem.get('accession').endswith(('MS:1000521', 'MS:1000522', 'MS:1000523', 'MS:1000519', 'MS:1000520')): # retrieves the datatype based on MS accession
......@@ -81,42 +98,55 @@ def mzml_splitter(mzml_file: str):
data_format[current_type]={'data_encoding': current_encoding,'data_compression': current_compress,'data_type': current_type}
elif elem.tag.endswith('}spectrumList'):
spec_no=elem.get('count')
print(spec_no*2)
spec_no=int(elem.get('count'))
print("Total binary data:",spec_no*2)
elif len(data_format.keys())==2:
print(data_format)
print("Mass and intensity data format", data_format)
break
data_position=find_string(file_name,'<binary>', '</binary>')
with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp:
m = mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ)
i=0
for line in mzml_in_fp:
if line.lstrip().startswith('<binary>'):
mzml_out_fp.write(line.split('<binary>')[0])
if i%2==0:
fmt = data_format['mass']
else:
fmt = data_format['intensity']
m.seek(data_position[i][0])
b64_content_byte=m.read(data_position[i][1]-data_position[i][0])
number_array = base64_decoder(b64_content_byte, fmt['data_encoding'], fmt['data_compression'], fmt['data_type'])
mzml_out_fp.write("<binary>$%s$</binary>\n" % '$'.join([str(len(number_array)), fmt['data_encoding'], fmt['data_compression'], fmt['data_type']]))
number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
if fmt['data_type']=='mass':
mass_binary_out_fp.write(number_array.astype(number_fmt))
else:
int_binary_out_fp.write(number_array.astype(number_fmt))
i+=1
else:
mzml_out_fp.write(line)
data_position=find_string(mzml_file,'<binary>', '</binary>', data_format, spec_no)
# with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp:
# #m = mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ)
# i=0
# for line in mzml_in_fp:
# if line.lstrip().startswith('<binary>') and i< spec_no*2:
# mzml_out_fp.write(line.split('<binary>')[0])
#
# if i%2==0:
# fmt = data_format['mass']
# else:
# fmt = data_format['intensity']
#
# # m.seek(data_position[i][0])
# # b64_content_byte = m.read(data_position[i][1] - data_position[i][0])
# # number_array = base64_decoder(b64_content_byte, fmt['data_encoding'], fmt['data_compression'], fmt['data_type'])
# # number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
# # if fmt['data_type'] == 'mass':
# # mass_binary_out_fp.write(number_array.astype(number_fmt))
# # else:
# # int_binary_out_fp.write(number_array.astype(number_fmt))
#
# mzml_out_fp.write("<binary>$%s$</binary>\n" % '$'.join([fmt['data_encoding'], fmt['data_compression'], fmt['data_type']]))
# i+=1
# else:
# mzml_out_fp.write(line)
# m = mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ)
# m.seek(data_position[i][0])
# b64_content_byte = m.read(data_position[i][1] - data_position[i][0])
# number_array = base64_decoder(b64_content_byte, fmt['data_encoding'], fmt['data_compression'], fmt['data_type'])
# number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
# if fmt['data_type'] == 'mass':
# mass_binary_out_fp.write(number_array.astype(number_fmt))
# else:
# int_binary_out_fp.write(number_array.astype(number_fmt))
# Close file pointers
mzml_out_fp.close()
#mzml_out_fp.close()
int_binary_out_fp.close()
mass_binary_out_fp.close()
......@@ -124,7 +154,7 @@ def mzml_splitter(mzml_file: str):
if __name__ == '__main__':
    # Script entry point: report the input size, run the splitter and print
    # the elapsed wall-clock time in seconds.
    import sys

    # The input path may be given as the first command-line argument; with no
    # argument the original hard-coded test file is used.
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    else:
        file_name = r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'

    start = time.time()
    print("Total file size", os.path.getsize(file_name)/1e9, "GB")
    # Run once; the return value is not printed.
    mzml_splitter(file_name)
    print(time.time()-start)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment