Commit 4ebff29b authored by yugao@uic.edu

updated faster splitter

parent 97e8cad5
......@@ -4,6 +4,7 @@ import zlib
import base64
import numpy as np
import time
import os
np_dtype_numtype = {'i': np.int32, 'e': np.single, 'f': np.float32, 'q': np.int64, 'd': np.float64}
......@@ -76,8 +77,8 @@ def mzml_splitter(mzml_file: str):
mzml_out_fp = open(mzml_out, 'w', newline='\n', encoding='utf-8')
# Create empty list for data temporary storage
data_position = []
data_format={}
data_position=[]
# Iterate mzML file and get all data into data_position[]
for event, elem in iter(ET.iterparse(mzml_file, events=('end',))):
if elem.tag.endswith('}cvParam'):
......@@ -89,10 +90,11 @@ def mzml_splitter(mzml_file: str):
data_type_dict['data_type'] = accession_dict[elem.get('accession')]
elif elem.tag.endswith('}binary'):
number_array = base64_decoder(elem.text, data_type_dict['data_encoding'], data_type_dict['data_compression'], data_type_dict['data_type'])
number_array = base64_lossy_decoder(elem.text, data_type_dict['data_encoding'], data_type_dict['data_compression'], data_type_dict['data_type'])
data_type_dict['data_number']=str(len(number_array))
data_position.append(([data_type_dict['data_number'], data_type_dict['data_encoding'], data_type_dict['data_compression'], data_type_dict['data_type']], number_array))
# Split mzml into smzml, binary mass and binary int
with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp:
i=0
......@@ -193,3 +195,10 @@ def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
mass_in_fp.close()
int_in_fp.close()
smzml_file.close()
if __name__ == '__main__':
    # Ad-hoc timing harness: report the input size in GB, then the seconds
    # elapsed since the timer was started.
    # `file_name` is kept as a module-level name in case other code in this
    # file reads it.
    file_name = r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
    began = time.time()
    size_gb = os.path.getsize(file_name) / 1e9
    print(size_gb, "GB")
    elapsed = time.time() - began
    print(elapsed)
\ No newline at end of file
# mscompress-pub.mzml_splitter created by bathy at 9/24/2021
import mmap
import os
import numpy as np
import time
import base64
import zlib
from xml.etree import cElementTree as ET
np_dtype_numtype = {'i': np.int32, 'e': np.single, 'f': np.float32, 'q': np.int64, 'd': np.float64}
accession_dict = {"MS:1000519": "32i",
"MS:1000520": "16e",
"MS:1000521": "32f",
"MS:1000522": "64i",
"MS:1000523": "64d",
"MS:1000574": "zlib",
"MS:1000576": "no compression",
"MS:1000515": "intensity",
"MS:1000514": "mass"}
def find_string(file_name, match_tag_start, match_tag_end):
    """Return the byte ranges enclosed by each start/end tag pair in a file.

    The file is memory-mapped read-only and scanned front to back.

    Args:
        file_name: Path of the file to scan.
        match_tag_start: Opening tag text, e.g. ``'<binary>'``.
        match_tag_end: Closing tag text, e.g. ``'</binary>'``.

    Returns:
        A list of ``(start, end)`` byte offsets, in file order, where
        ``start`` is the first byte after the opening tag and ``end`` is the
        offset of the matching closing tag (i.e. a half-open payload range).
        An opening tag that has no closing tag after it ends the scan and is
        not reported. An empty file yields an empty list.
    """
    start_time = time.time()
    data_positions = []
    # Encode once; offsets are byte offsets, so lengths must be byte lengths.
    start_tag = match_tag_start.encode('utf-8')
    end_tag = match_tag_end.encode('utf-8')
    with open(file_name, 'rb') as f:
        # mmap refuses to map a zero-length file, so skip the scan entirely.
        if os.fstat(f.fileno()).st_size > 0:
            # size 0 means "map the whole file"
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
                cursor = 0
                while True:
                    start = m.find(start_tag, cursor)
                    if start == -1:
                        break
                    payload_start = start + len(start_tag)
                    # Look for the closing tag only after this opening tag.
                    end = m.find(end_tag, payload_start)
                    if end == -1:
                        # Unterminated tag: recording -1 and seeking back would
                        # re-find the same opening tag forever.
                        break
                    data_positions.append((payload_start, end))
                    cursor = end + len(end_tag)
    print("total time used to position all b64 data:", time.time() - start_time)
    return data_positions
def base64_decoder(base64_data:bytes, number_fmt, compress_method, array_type):
    """Decode one base64 payload into a NumPy array.

    Args:
        base64_data: Base64-encoded bytes, or None.
        number_fmt: Encoding label whose last character selects the element
            type through ``np_dtype_numtype`` (e.g. ``'32f'``).
        compress_method: ``'zlib'`` to inflate the decoded bytes first; any
            other value leaves them untouched.
        array_type: Accepted for interface compatibility; not used here.

    Returns:
        The decoded values as an array viewing the decoded buffer, or an
        empty default-dtype array when ``base64_data`` is None.
    """
    # Nothing to decode: hand back an empty array.
    if base64_data is None:
        return np.array([])

    # Resolve the element type before touching the payload.
    element_type = np_dtype_numtype[number_fmt[-1]]
    raw_bytes = base64.decodebytes(base64_data)
    if compress_method == 'zlib':
        raw_bytes = zlib.decompress(raw_bytes)
    return np.frombuffer(raw_bytes, dtype=element_type)
def mzml_splitter(mzml_file: str):
    """Split an mzML file into a stripped XML file plus two raw binary files.

    Three files are written next to the input, named by replacing ``.mzML``
    in the input path:

    * ``.smzml`` - the input text with every line that starts with
      ``<binary>`` rewritten to
      ``<binary>$<count>$<encoding>$<compression>$<type>$</binary>``;
    * ``.bmass`` - the decoded mass arrays, concatenated as raw numbers;
    * ``.bint``  - the decoded intensity arrays, concatenated as raw numbers.

    The encoding/compression of the two array kinds is read once from the
    first cvParam elements of the file and then assumed for the whole file;
    ``<binary>`` lines are assumed to alternate mass, intensity, mass, ...

    Args:
        mzml_file: Path of the input file; must contain ``.mzML``.

    Returns:
        None.

    Raises:
        ValueError: If ``mzml_file`` does not contain ``.mzML`` (the derived
            output names would equal the input and truncate it).
    """
    # Generate file names. Without the marker, str.replace is a no-op and the
    # "output" files would be opened for writing on top of the input.
    if '.mzML' not in mzml_file:
        raise ValueError("input path must contain '.mzML': %r" % (mzml_file,))
    int_binary_file = mzml_file.replace('.mzML', '.bint')
    mass_binary_file = mzml_file.replace('.mzML', '.bmass')
    mzml_out = mzml_file.replace('.mzML', '.smzml')

    encoding_accessions = ('MS:1000521', 'MS:1000522', 'MS:1000523', 'MS:1000519', 'MS:1000520')
    compression_accessions = ('MS:1000576', 'MS:1000574')
    array_type_accessions = ('MS:1000515', 'MS:1000514')

    # Scan the head of the document until the format of both array kinds is
    # known. Defaults apply if no encoding/compression cvParam is seen first.
    data_format = {}
    current_encoding = '32f'
    current_compress = 'no compression'
    with open(mzml_file, 'rb') as header_fp:
        for event, elem in ET.iterparse(header_fp, events=('start',)):
            if elem.tag.endswith('}cvParam'):
                accession = elem.get('accession') or ''
                if accession.endswith(encoding_accessions):  # number encoding
                    current_encoding = accession_dict[accession]
                if accession.endswith(compression_accessions):  # compression
                    current_compress = accession_dict[accession]
                if accession.endswith(array_type_accessions):  # array kind
                    current_type = accession_dict[accession]
                    data_format[current_type] = {'data_encoding': current_encoding,
                                                 'data_compression': current_compress,
                                                 'data_type': current_type}
            elif elem.tag.endswith('}spectrumList'):
                spec_no = elem.get('count')
                if spec_no is not None:
                    # The attribute is text; convert before doubling so this
                    # prints a number rather than the string repeated twice.
                    print(int(spec_no) * 2)
            elif len(data_format) == 2:
                print(data_format)
                break

    # Byte ranges of every base64 payload, in file order.
    data_position = find_string(mzml_file, '<binary>', '</binary>')

    # Split mzml into smzml, binary mass and binary int. The context managers
    # close every handle (and the mapping) even if decoding fails midway.
    with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp, \
            open(mzml_out, 'w', newline='\n', encoding='utf-8') as mzml_out_fp, \
            open(mass_binary_file, 'wb') as mass_binary_out_fp, \
            open(int_binary_file, 'wb') as int_binary_out_fp, \
            mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ) as m:
        i = 0
        for line in mzml_in_fp:
            if not line.lstrip().startswith('<binary>'):
                mzml_out_fp.write(line)
                continue
            # Keep the line's leading whitespace in front of the placeholder.
            mzml_out_fp.write(line.split('<binary>')[0])
            # Even-numbered payloads are treated as mass, odd as intensity.
            fmt = data_format['mass'] if i % 2 == 0 else data_format['intensity']
            payload_start, payload_end = data_position[i]
            b64_content_byte = m[payload_start:payload_end]
            number_array = base64_decoder(b64_content_byte, fmt['data_encoding'],
                                          fmt['data_compression'], fmt['data_type'])
            mzml_out_fp.write("<binary>$%s$</binary>\n" % '$'.join(
                [str(len(number_array)), fmt['data_encoding'], fmt['data_compression'], fmt['data_type']]))
            number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
            if fmt['data_type'] == 'mass':
                mass_binary_out_fp.write(number_array.astype(number_fmt))
            else:
                int_binary_out_fp.write(number_array.astype(number_fmt))
            i += 1
if __name__ == '__main__':
    # Script entry point: report the input size in GB, run the splitter,
    # print what it returned, then print the elapsed seconds.
    # `file_name` is deliberately kept as a module-level name with this exact
    # spelling; other code in this file may read it.
    file_name = r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
    began = time.time()
    size_gb = os.path.getsize(file_name) / 1e9
    print(size_gb, "GB")
    outcome = mzml_splitter(file_name)
    print(outcome)
    elapsed = time.time() - began
    print(elapsed)
# mscompress-pub.tester created by bathy at 9/23/2021
import mzml_reader as mzr
# Input mzML file for the test run.
# NOTE(review): this first assignment is overwritten by the very next line,
# so only the second path is in effect; presumably the two lines are toggled
# by hand to switch test inputs - confirm.
mzml_file = r'K:\test\32bit_uncompressed_centroid\fusion_20200116_qjl_YLD_19.mzML'
mzml_file = r'K:\test\64bit_zlib_profile\fusion_20200116_qjl_YLD_19.mzML'
# Paths of the .smzml / .bmass / .bint files used by the test; presumably the
# stripped-XML, mass-binary and intensity-binary outputs of the splitter -
# TODO confirm against the (truncated) remainder of this script.
smzml_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.smzml'
bmass_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bmass'
bint_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bint'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment