Commit e6a7b13d authored by yugao@uic.edu's avatar yugao@uic.edu

added decoder function in mzml_splitter.py

Need to do:
1.  multi-threaded zlib?
parent e4ccaf8e
......@@ -7,6 +7,8 @@ import pybase64
import zlib
from xml.etree import cElementTree as ET
np_dtype_numtype = {'i': np.int32, 'e': np.single, 'f': np.float32, 'q': np.int64, 'd': np.float64}
def chunks(l, n):
"""Yield successive n-sized chunks from l."""
......@@ -17,12 +19,12 @@ def chunks(l, n):
def decode_pos(data_position, mzml_fp, data_fmt, array_type):
    """Read one base64 payload from the mzML file and decode it.

    Args:
        data_position: ``(start, end)`` offsets of the payload inside
            ``mzml_fp``; ``end - start`` units are read after seeking to
            ``start``.
        mzml_fp: Seekable file object positioned on the mzML file.
            NOTE(review): presumably opened in binary mode, since
            ``base64_decoder`` annotates its input as ``bytes`` -- confirm.
        data_fmt: Mapping providing the ``'data_encoding'`` and
            ``'data_compression'`` entries handed to ``base64_decoder``.
        array_type: Label forwarded unchanged to ``base64_decoder``
            (callers in this file pass ``'mass'`` or ``'intensity'``).

    Returns:
        The ``(num_data, num_array)`` pair produced by ``base64_decoder``.
    """
    data_start, data_end = data_position
    mzml_fp.seek(data_start)
    data = mzml_fp.read(data_end - data_start)
    # Single decode path: the payload is read once and the decoder's
    # two-item result is unpacked explicitly so a shape change in
    # base64_decoder fails loudly here rather than in the caller.
    num_data, num_array = base64_decoder(
        data, data_fmt['data_encoding'], data_fmt['data_compression'], array_type)
    return num_data, num_array
def base64_decoder(base64_data: bytes, number_fmt, compress_method, array_type):
np_dtype_numtype = {'i': np.int32, 'e': np.single, 'f': np.float32, 'q': np.int64, 'd': np.float64}
if base64_data is not None:
num_type = np_dtype_numtype[number_fmt[-1]]
decode_base64 = pybase64._pybase64.b64decode_as_bytearray(base64_data)
......@@ -32,8 +34,15 @@ def base64_decoder(base64_data: bytes, number_fmt, compress_method, array_type):
# if array_type == 'intensity':
# data = np.log2(np.where(data>0.00001, data, 0.00001)/np.linalg.norm(data)) # performs log only on intensity
else:
data = np.array([])
return data
data = 0, np.array([])
return len(data), data
def base64_encoder(number_array: np.ndarray, compress_method: str):
    """Turn a NumPy array into a single-line base64 ASCII string.

    Args:
        number_array: Array whose raw buffer (``tobytes()``) is encoded.
        compress_method: When equal to ``'zlib'`` the raw bytes are
            zlib-compressed before encoding; any other value leaves them
            uncompressed.

    Returns:
        The base64 text as ``str`` with every newline character removed.
    """
    raw = number_array.tobytes()
    payload = zlib.compress(raw) if compress_method == 'zlib' else raw
    encoded = pybase64.b64encode(payload)
    text = encoded.decode('ascii')
    return text.replace('\n', '')
accession_dict = {"MS:1000519": "32i",
......@@ -88,7 +97,7 @@ def mzml_splitter(mzml_file: str):
# Generate file names
int_binary_file = mzml_file.replace('.mzML', '.bint')
mass_binary_file = mzml_file.replace('.mzML', '.bmass')
#mzml_out = mzml_file.replace('.mzML', '.smzml')
# mzml_out = mzml_file.replace('.mzML', '.smzml')
# Open file pointers
mass_binary_out_fp = open(mass_binary_file, 'wb')
......@@ -121,20 +130,85 @@ def mzml_splitter(mzml_file: str):
break
data_position = find_string(mzml_read_fp, '<binary>', '</binary>', data_format, spec_no)
data_chunks = chunks(data_position,2)
data_chunks = chunks(data_position, 2)
mass_num_data_list = []
int_num_data_list = []
for each_data in data_chunks:
mass_pos, int_pos = each_data
mass_binary_out_fp.write(decode_pos(mass_pos, mzml_read_fp, data_format['mass'], 'mass'))
int_binary_out_fp.write(decode_pos(mass_pos, mzml_read_fp, data_format['intensity'], 'intensity'))
mass_num_data, mass_data_to_write = decode_pos(mass_pos, mzml_read_fp, data_format['mass'], 'mass')
mass_binary_out_fp.write(mass_data_to_write)
mass_num_data_list.append(mass_num_data)
int_num_data, int_data_to_write = decode_pos(int_pos, mzml_read_fp, data_format['intensity'], 'intensity')
int_binary_out_fp.write(int_data_to_write)
int_num_data_list.append(int_num_data)
mass_num_data_list.append(len(mass_num_data_list))
int_num_data_list.append(len(int_num_data_list))
mass_binary_out_fp.write(np.array(mass_num_data_list, dtype=np.int32).tobytes())
int_binary_out_fp.write(np.array(int_num_data_list, dtype=np.int32).tobytes())
mzml_read_fp.close()
int_binary_out_fp.close()
mass_binary_out_fp.close()
def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
    """Rebuild an mzML file from its text skeleton and two binary side files.

    The skeleton is copied line by line. Every line containing ``<binary>``
    is treated as a placeholder: the text before ``<binary>`` is kept, the
    numeric payload is read back from the matching side file, re-encoded
    with ``base64_encoder`` and written as ``<binary>...</binary>``.

    Args:
        smzml_file: Path of the UTF-8 skeleton file. A placeholder line must
            split on ``'$'`` into exactly five fields; fields 2-4 are the
            number format (first two characters = bit width, last character
            = key into ``np_dtype_numtype``), the compression method, and
            the array type.
        bmass_file: Path of the binary file read for ``'mass'`` arrays.
        bint_file: Path of the binary file read for every other array type.
            Its last 4 bytes hold the number of count entries and the
            ``4 * count`` bytes before that hold the int32 per-spectrum
            element counts.
        mzml_file: Path of the mzML file to write (UTF-8, ``\\n`` newlines).

    Raises:
        OSError: If a file cannot be opened or the footer seek fails.
        ValueError: If a placeholder line does not contain five
            ``'$'``-separated fields.
        KeyError: If the format's type code is not in ``np_dtype_numtype``.
    """
    # Context managers guarantee all handles are released even when decoding
    # fails part-way through.
    with open(bmass_file, 'rb') as mass_in_fp, \
            open(bint_file, 'rb') as int_in_fp, \
            open(smzml_file, 'r', encoding='utf-8') as smzml_fp:
        # Footer of the intensity file: [... counts (int32 each) ...][n_counts]
        int_in_fp.seek(-4, os.SEEK_END)
        total_spec_no = int.from_bytes(int_in_fp.read(4), byteorder='little')
        int_in_fp.seek(-4 * (total_spec_no + 1), os.SEEK_END)
        spec_no_array = np.frombuffer(int_in_fp.read(4 * total_spec_no), np.int32)
        int_in_fp.seek(0)  # payload data starts at the beginning of the file

        i = 0  # index of the spectrum currently being restored
        # The output is opened only after the footer was read successfully,
        # so a broken side file does not truncate an existing output file.
        with open(mzml_file, 'w', newline='\n', encoding='utf-8') as f_out:
            for line in smzml_fp:
                if '<binary>' in line and i < total_spec_no:
                    f_out.write(line.split('<binary>')[0])
                    _, data_fmt, data_compression, data_type, _ = line.split('$')
                    # bits per value * number of values, expressed in bytes
                    n_bytes = int(data_fmt[:2]) * int(spec_no_array[i]) // 8
                    dtype = np_dtype_numtype[data_fmt[-1]]
                    if data_type == 'mass':
                        number_array = np.frombuffer(mass_in_fp.read(n_bytes), dtype)
                    else:
                        number_array = np.frombuffer(int_in_fp.read(n_bytes), dtype)
                        # One count entry covers a spectrum's mass and
                        # intensity arrays, so advance only after the
                        # non-mass array has been consumed.
                        # NOTE(review): assumes the mass placeholder precedes
                        # the intensity placeholder for each spectrum -- confirm.
                        i += 1
                    f_out.write('<binary>%s</binary>\n' % base64_encoder(number_array, data_compression))
                else:
                    f_out.write(line)
if __name__ == '__main__':
file_name = r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
start = time.time()
print("Total file size", os.path.getsize(file_name) / 1e9, "GB")
mzml_splitter(file_name)
#mzml_splitter(file_name)
smzml_file = r'K:\test\test\fusion_20200116_qjl_YLD_19.smzml'
bmass_file = r'K:\test\test\fusion_20200116_qjl_YLD_19.bmass'
bint_file = r'K:\test\test\fusion_20200116_qjl_YLD_19.bint'
out_mzml = r'K:\test\test\fusion_20200116_qjl_YLD_19_out.mzML'
mzml_decoder(smzml_file, bmass_file, bint_file, out_mzml)
b64_test='eJyFmntUVPUWx2EEAyVUHHAUzEpuoshjCBEkQGDOIPciKg/HxIpUiMy0h4gX9Uq+QgMEJHwhKEvAfMVCuSpMmkKjiaHxSFLkWYpIvCR8wMy9a2qt1mLV5/T793u++3XOb+/9279jYMDLsWSvkrCQ/O8VhF13cEKe3frTiLWFjxQIW9Wbhjx1rg3yCspnITbipSb04Z9h3Ygt0jT/IbM1NSGQntOvxOypwtS2XwIIT7okoF9V63MRc1jojX51v3ADbZf1f4G8tUcExAa1LPNISAVidp3b0IengY2ob2pOEPuXegv1WXXZYqw3vGOCMpetmYbY3PxxiNkl2KO+wjHzkfe4Jwx5+mXimYxx21V1AuWuU7oj9vWjnzBufTcPIa+x4jbacjBuJmIr/leA+opcqhAb5++Ithz6x03kSayfIhZaXY7YlBwT9MFz8q+ISbQNaKfjds0QfY2HHPDZIHNjxGwNghFrW8t2r9iVLZi/O0Lo267DZ2ZLOP9tzAhEvd9reW9WrXhVmRfrh3jaFd5n9wwNEYsZxTXF+Eo28vRr39ymOYSNdeWac3OZrahcxSMJytUvj46M3/irnTNxrwfFZqBfY5uno/5c7++Q1zd/NPJ6Kwy4xtZyzTNOlKC+phSuv4Umd4bIdLfrQh01KTaoo0sjRx1XAl5Fno/kDPKsbHVoS5KfN8pc/Y4Hyvz8aBTy6u43K1KC+Ru/UH0SsdoQf8Q0NT74fc3buxvtaf/pNMqc9MNm5GVGKpFX7N4uuicSuq9jzM/uGCZaE40mWIrKNlWZoc1Rd3lPGN8xVF5dv11Ut34NXrBXatScq52udaBvSs0NhVsax7QoZDRiD6043lYjOUd9dIdtPTfYyrnfnXuI2nTehyXXucY5BE1E/yJrpMibFcr6OodPRKyjRsv96mfm+K5vx09CO+c4T0deY8xW/vaeWqKdKQkViP13/nOI5YeWInYgmvNa+su8RzaVcG4KTRyDmNsRK5SZ/HgS8la43GY7n/dCXtHMH5C35I4asaXhXPuaVefQB/06kcA53yCQz007kzne0S7DkXdKOhl5s9pNkSd/g30cPfkuxiZGXYa8Bdcs8LtP32KJdsZOb0Z9VTL+tjc6tiNPvxoS8+fs1bqK9mb6tXx/n2hOH+PK/c7xECn6lfbzRaG8eJOobPtKuWD2bQ7KGB5bgPb/57CRELduFOIe6mGIZUbwuSp3H8e11aYasbLwh6Lv43JHBM5A7rbI0NaaxwWi9Vy/FnYE4zNp0ftRdlPXfGFg8TbRd6Tb9gW+n9dvce9dZ8o5bYlVBMo0UW3lOvCsAWN8/uMZHMOwR8g7oOH8at5dGpCd7SS6h6IGbTB+GfN4/le3heeGx55vRp1XvRKVvpbcz2gPmgmyRT2od1oq58UP22/wHGpuAmLWpXIhynQl98KbfVDn2P05nFMDMxGL9Pzr3Pik/zXk+C8tQxtt9/B7VllMYBvnXcBY63bMHqIvIuj3fNXvx7Mpr4VnUV5wShLyzLVlyOty7kTsjepv+XznU8O+aesQ63fnPt0t5UfkJTtzr5oju4n7zFKt4O8kXY68sH7OCzrfpcrMvp6/9ON4zUiU+VYM14i8OaeFJKfhGJstUWGIxfXvE1Imco5rsB0QzVMuody7mY8wREy2g/uhNS3Jf1ubZBuM8Zn85tmC0dli0fpzxigX/VJr+tDudXu4/z3/I/eN1ZJv/sRzLuQ90nmJ59769Yt0mWiMBg+Wifr/ZX2iKF5VXIy+zCg5j7ZlvGTO91bpvehvfDjnkoYHzmjrtEv7UV/HBj/Eypt4v5SqLqJ/iz3uo53e8XwXs2sZn9Nu7eG53ImMB0Owhl6e950slyAWu4rnA08O8nk9yugw2n1f6oqY73EPxL4q1gnnNh4eYo97z2v4jrfGvch5OLec7zQNn3F9WvA+80pqUd++SD47nfisju+x/p01BMv3vYzP+up2c18wc3uANGQq4h/cMMY4lv
jmoc+DT1oFCx2fs5OncI0wDDXgumz6DHlHVb9yH3v4nmh+Oro89G9nhg7BbqLPVBzjOdxbHjy7XXzqId8LvcJ7yb6Nv52+D8wQix7jgtjsnfaI+ZnoELOozEOsMolzjN/uuRjT0MFC0TmOzOuMKB7iwndqXwbwHE9WwXeW8gNtPKM1G1BkSU2xjna/Nx59tVk+A21decUaednHPkGea8gaxHba8R6LXDUefQy2eYrYjDq20yX/a9F39ekFPvOEX+SZeU43z+RiFrkhr0xljbxR8eMRK5xsgTITd/Kes/hXFsr0f9NduHSMz55Lg3kOOPGaKcbcZwHvH4m1WtGy4G3EHXtbUGfWak+uWbGNCmmLL9p0IGeX6HfwuYXDb3id+xTUsbyI51BLZm75Q/77xfcwd27Wev3JjvMrOfepNsrR7m/kPLuSHglCnsXrnZyHO69yT3zrAfpl4sIzl/o4nn3IG/n/nd1ZfE7OSOV/C6LtPkLesNZKzjE9PKMMK+F3emol3/O2ltiJfndpC8fymdahmueHAt/57LlXzzNpH547Wm6t5buUOu5BCzbx/XhKpQaxF9q53qXmcU6La+F5U+k2vrfyVvHdlFd7H/dwZzlmXtN/Rt5VJ/4vZMR1nq1ED3Dedaxn3zf5cs9W48l3AY+O8nevCOvneJ5kH/zH8RnuxQi+M+ldy76rXxnkWJ95gtjdIiuUOaGe/+eLvcznNZuBaYJZPNeK/wMz9dXw'
a=base64_decoder(b64_test.encode('utf-8'), '32f', 'zlib', 'intensity')
print(a)
print(base64_encoder(a[1],'zlib'))
print(time.time() - start)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment