Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
MSCompress
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Yu Gao
MSCompress
Commits
e6a7b13d
Commit
e6a7b13d
authored
Sep 27, 2021
by
yugao@uic.edu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added decoder function in mzml_splitter.py
Need to do: 1. multi-thread zlib?
parent
e4ccaf8e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
84 additions
and
10 deletions
+84
-10
mzml_splitter.py
mzml_splitter.py
+84
-10
No files found.
mzml_splitter.py
View file @
e6a7b13d
...
...
@@ -7,6 +7,8 @@ import pybase64
import
zlib
from
xml.etree
import
cElementTree
as
ET
# Lookup table from the trailing type-code character of a number-format
# string (e.g. the 'f' in '32f') to the NumPy scalar type used to
# interpret the raw bytes.
# NOTE(review): 'e' maps to np.single, which is NumPy's float32 alias, so
# 'e' and 'f' resolve to the same type - confirm this is intended.
np_dtype_numtype = {
    'i': np.int32,
    'e': np.single,
    'f': np.float32,
    'q': np.int64,
    'd': np.float64,
}
def
chunks
(
l
,
n
):
"""Yield successive n-sized chunks from l."""
...
...
@@ -17,12 +19,12 @@ def chunks(l, n):
def decode_pos(data_position, mzml_fp, data_fmt, array_type):
    """Read one base64 payload from an open mzML file and decode it.

    Args:
        data_position: Pair ``(data_start, data_end)`` of file offsets that
            delimit the payload.
        mzml_fp: Open, seekable file object positioned anywhere; it is moved
            to ``data_start`` and left just after the payload.
        data_fmt: Mapping that provides the ``'data_encoding'`` and
            ``'data_compression'`` entries forwarded to ``base64_decoder``.
        array_type: Array label forwarded unchanged to ``base64_decoder``.

    Returns:
        The two values produced by ``base64_decoder`` for this payload,
        as a ``(num_data, num_array)`` tuple.
    """
    data_start, data_end = data_position
    mzml_fp.seek(data_start)
    # Read the payload exactly once; the earlier revision's early return
    # left a second read and the tuple return below it unreachable.
    data = mzml_fp.read(data_end - data_start)
    num_data, num_array = base64_decoder(
        data,
        data_fmt['data_encoding'],
        data_fmt['data_compression'],
        array_type,
    )
    return num_data, num_array
def
base64_decoder
(
base64_data
:
bytes
,
number_fmt
,
compress_method
,
array_type
):
np_dtype_numtype
=
{
'i'
:
np
.
int32
,
'e'
:
np
.
single
,
'f'
:
np
.
float32
,
'q'
:
np
.
int64
,
'd'
:
np
.
float64
}
if
base64_data
is
not
None
:
num_type
=
np_dtype_numtype
[
number_fmt
[
-
1
]]
decode_base64
=
pybase64
.
_pybase64
.
b64decode_as_bytearray
(
base64_data
)
...
...
@@ -32,8 +34,15 @@ def base64_decoder(base64_data: bytes, number_fmt, compress_method, array_type):
# if array_type == 'intensity':
# data = np.log2(np.where(data>0.00001, data, 0.00001)/np.linalg.norm(data)) # performs log only on intensity
else
:
data
=
np
.
array
([])
return
data
data
=
0
,
np
.
array
([])
return
len
(
data
),
data
def base64_encoder(number_array: np.ndarray, compress_method: str):
    """Serialize a NumPy array to a single-line base64 ASCII string.

    Args:
        number_array: Array whose raw bytes (``tobytes()``) are encoded.
        compress_method: When equal to ``'zlib'`` the raw bytes are
            zlib-compressed before encoding; any other value leaves them
            uncompressed.

    Returns:
        The base64 text with any newline characters removed.
    """
    raw = number_array.tobytes()
    payload = zlib.compress(raw) if compress_method == 'zlib' else raw
    encoded_text = pybase64.b64encode(payload).decode('ascii')
    # Strip line breaks so the result can be embedded on one line.
    return encoded_text.replace('\n', '')
accession_dict
=
{
"MS:1000519"
:
"32i"
,
...
...
@@ -88,7 +97,7 @@ def mzml_splitter(mzml_file: str):
# Generate file names
int_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bint'
)
mass_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bmass'
)
#mzml_out = mzml_file.replace('.mzML', '.smzml')
#
mzml_out = mzml_file.replace('.mzML', '.smzml')
# Open file pointers
mass_binary_out_fp
=
open
(
mass_binary_file
,
'wb'
)
...
...
@@ -121,20 +130,85 @@ def mzml_splitter(mzml_file: str):
break
data_position
=
find_string
(
mzml_read_fp
,
'<binary>'
,
'</binary>'
,
data_format
,
spec_no
)
data_chunks
=
chunks
(
data_position
,
2
)
data_chunks
=
chunks
(
data_position
,
2
)
mass_num_data_list
=
[]
int_num_data_list
=
[]
for
each_data
in
data_chunks
:
mass_pos
,
int_pos
=
each_data
mass_binary_out_fp
.
write
(
decode_pos
(
mass_pos
,
mzml_read_fp
,
data_format
[
'mass'
],
'mass'
))
int_binary_out_fp
.
write
(
decode_pos
(
mass_pos
,
mzml_read_fp
,
data_format
[
'intensity'
],
'intensity'
))
mass_num_data
,
mass_data_to_write
=
decode_pos
(
mass_pos
,
mzml_read_fp
,
data_format
[
'mass'
],
'mass'
)
mass_binary_out_fp
.
write
(
mass_data_to_write
)
mass_num_data_list
.
append
(
mass_num_data
)
int_num_data
,
int_data_to_write
=
decode_pos
(
int_pos
,
mzml_read_fp
,
data_format
[
'intensity'
],
'intensity'
)
int_binary_out_fp
.
write
(
int_data_to_write
)
int_num_data_list
.
append
(
int_num_data
)
mass_num_data_list
.
append
(
len
(
mass_num_data_list
))
int_num_data_list
.
append
(
len
(
int_num_data_list
))
mass_binary_out_fp
.
write
(
np
.
array
(
mass_num_data_list
,
dtype
=
np
.
int32
)
.
tobytes
())
int_binary_out_fp
.
write
(
np
.
array
(
int_num_data_list
,
dtype
=
np
.
int32
)
.
tobytes
())
mzml_read_fp
.
close
()
int_binary_out_fp
.
close
()
mass_binary_out_fp
.
close
()
def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
    """Rebuild an mzML file from a stripped text file and two binary files.

    The last 4 bytes of ``bint_file`` are read as a little-endian integer
    giving the number of spectra; the ``4 * total`` bytes immediately before
    it are read as int32 per-spectrum value counts.  Every line of
    ``smzml_file`` that contains ``'<binary>'`` (while spectra remain) is
    split on ``'$'`` into five fields, of which the middle three are the
    number format, the compression method and the array type.  The matching
    number of values is read from ``bmass_file`` (type ``'mass'``) or
    ``bint_file`` (any other type), re-encoded with ``base64_encoder`` and
    written back inside a ``<binary>`` element.  All other lines are copied
    unchanged.

    Args:
        smzml_file: Path of the stripped mzML text file (read as UTF-8).
        bmass_file: Path of the binary file holding the mass arrays.
        bint_file: Path of the binary file holding the intensity arrays and
            the count trailer described above.
        mzml_file: Path of the mzML file to write (UTF-8, ``'\\n'`` newlines).

    Raises:
        ValueError: If a processed ``<binary>`` line does not split into
            exactly five ``'$'``-separated fields.
        KeyError: If the last character of the number format is not a key of
            ``np_dtype_numtype``.
    """
    # Context managers close every handle even when decoding fails part-way;
    # the path argument is no longer rebound to a file object.
    with open(bmass_file, 'rb') as mass_in_fp, \
            open(bint_file, 'rb') as int_in_fp, \
            open(smzml_file, 'r', encoding='utf-8') as smzml_fp:
        # Read the trailer: spectrum total first, then the per-spectrum counts.
        int_in_fp.seek(-4, os.SEEK_END)
        total_spec_no = int.from_bytes(int_in_fp.read(4), byteorder='little')
        int_in_fp.seek(-4 * (total_spec_no + 1), os.SEEK_END)
        spec_no_array = np.frombuffer(int_in_fp.read(4 * total_spec_no), np.int32)
        int_in_fp.seek(0)

        i = 0
        # Restore mzML file from smzml
        with open(mzml_file, 'w', newline='\n', encoding='utf-8') as f_out:
            for line in smzml_fp:
                if '<binary>' not in line or i >= total_spec_no:
                    f_out.write(line)
                    continue

                # Keep whatever precedes the placeholder on this line.
                f_out.write(line.split('<binary>')[0])
                _, data_fmt, data_compression, data_type, _ = line.split('$')

                # data_fmt[:2] is the bit width; integer division keeps the
                # byte count exact instead of going through a float.
                byte_count = int(data_fmt[:2]) * int(spec_no_array[i]) // 8

                if data_type == 'mass':
                    source_fp = mass_in_fp
                else:
                    source_fp = int_in_fp
                    # NOTE(review): the spectrum index advances after the
                    # non-mass array, so the mass array that precedes it
                    # shares the same count - confirm against the splitter.
                    i += 1

                number_array = np.frombuffer(
                    source_fp.read(byte_count),
                    np_dtype_numtype[data_fmt[-1]],
                )
                f_out.write('<binary>%s</binary>\n'
                            % base64_encoder(number_array, data_compression))
if
__name__
==
'__main__'
:
file_name
=
r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
start
=
time
.
time
()
print
(
"Total file size"
,
os
.
path
.
getsize
(
file_name
)
/
1e9
,
"GB"
)
mzml_splitter
(
file_name
)
#mzml_splitter(file_name)
smzml_file
=
r'K:\test\test\fusion_20200116_qjl_YLD_19.smzml'
bmass_file
=
r'K:\test\test\fusion_20200116_qjl_YLD_19.bmass'
bint_file
=
r'K:\test\test\fusion_20200116_qjl_YLD_19.bint'
out_mzml
=
r'K:\test\test\fusion_20200116_qjl_YLD_19_out.mzML'
mzml_decoder
(
smzml_file
,
bmass_file
,
bint_file
,
out_mzml
)
b64_test
=
'eJyFmntUVPUWx2EEAyVUHHAUzEpuoshjCBEkQGDOIPciKg/HxIpUiMy0h4gX9Uq+QgMEJHwhKEvAfMVCuSpMmkKjiaHxSFLkWYpIvCR8wMy9a2qt1mLV5/T793u++3XOb+/9279jYMDLsWSvkrCQ/O8VhF13cEKe3frTiLWFjxQIW9Wbhjx1rg3yCspnITbipSb04Z9h3Ygt0jT/IbM1NSGQntOvxOypwtS2XwIIT7okoF9V63MRc1jojX51v3ADbZf1f4G8tUcExAa1LPNISAVidp3b0IengY2ob2pOEPuXegv1WXXZYqw3vGOCMpetmYbY3PxxiNkl2KO+wjHzkfe4Jwx5+mXimYxx21V1AuWuU7oj9vWjnzBufTcPIa+x4jbacjBuJmIr/leA+opcqhAb5++Ithz6x03kSayfIhZaXY7YlBwT9MFz8q+ISbQNaKfjds0QfY2HHPDZIHNjxGwNghFrW8t2r9iVLZi/O0Lo267DZ2ZLOP9tzAhEvd9reW9WrXhVmRfrh3jaFd5n9wwNEYsZxTXF+Eo28vRr39ymOYSNdeWac3OZrahcxSMJytUvj46M3/irnTNxrwfFZqBfY5uno/5c7++Q1zd/NPJ6Kwy4xtZyzTNOlKC+phSuv4Umd4bIdLfrQh01KTaoo0sjRx1XAl5Fno/kDPKsbHVoS5KfN8pc/Y4Hyvz8aBTy6u43K1KC+Ru/UH0SsdoQf8Q0NT74fc3buxvtaf/pNMqc9MNm5GVGKpFX7N4uuicSuq9jzM/uGCZaE40mWIrKNlWZoc1Rd3lPGN8xVF5dv11Ut34NXrBXatScq52udaBvSs0NhVsax7QoZDRiD6043lYjOUd9dIdtPTfYyrnfnXuI2nTehyXXucY5BE1E/yJrpMibFcr6OodPRKyjRsv96mfm+K5vx09CO+c4T0deY8xW/vaeWqKdKQkViP13/nOI5YeWInYgmvNa+su8RzaVcG4KTRyDmNsRK5SZ/HgS8la43GY7n/dCXtHMH5C35I4asaXhXPuaVefQB/06kcA53yCQz007kzne0S7DkXdKOhl5s9pNkSd/g30cPfkuxiZGXYa8Bdcs8LtP32KJdsZOb0Z9VTL+tjc6tiNPvxoS8+fs1bqK9mb6tXx/n2hOH+PK/c7xECn6lfbzRaG8eJOobPtKuWD2bQ7KGB5bgPb/57CRELduFOIe6mGIZUbwuSp3H8e11aYasbLwh6Lv43JHBM5A7rbI0NaaxwWi9Vy/FnYE4zNp0ftRdlPXfGFg8TbRd6Tb9gW+n9dvce9dZ8o5bYlVBMo0UW3lOvCsAWN8/uMZHMOwR8g7oOH8at5dGpCd7SS6h6IGbTB+GfN4/le3heeGx55vRp1XvRKVvpbcz2gPmgmyRT2od1oq58UP22/wHGpuAmLWpXIhynQl98KbfVDn2P05nFMDMxGL9Pzr3Pik/zXk+C8tQxtt9/B7VllMYBvnXcBY63bMHqIvIuj3fNXvx7Mpr4VnUV5wShLyzLVlyOty7kTsjepv+XznU8O+aesQ63fnPt0t5UfkJTtzr5oju4n7zFKt4O8kXY68sH7OCzrfpcrMvp6/9ON4zUiU+VYM14i8OaeFJKfhGJstUWGIxfXvE1Imco5rsB0QzVMuody7mY8wREy2g/uhNS3Jf1ubZBuM8Zn85tmC0dli0fpzxigX/VJr+tDudXu4/z3/I/eN1ZJv/sRzLuQ90nmJ59769Yt0mWiMBg+Wifr/ZX2iKF5VXIy+zCg5j7ZlvGTO91bpvehvfDjnkoYHzmjrtEv7UV/HBj/Eypt4v5SqLqJ/iz3uo53e8XwXs2sZn9Nu7eG53ImMB0Owhl6e950slyAWu4rnA08O8nk9yugw2n1f6oqY73EPxL4q1gnnNh4eYo97z2v4jrfGvch5OLec7zQNn3F9WvA+80pqUd++SD47nfisju+x/p01BMv3vYzP+up2c18wc3uANGQq4h/cMMY4lvjmoc+DT1o
FCx2fs5OncI0wDDXgumz6DHlHVb9yH3v4nmh+Oro89G9nhg7BbqLPVBzjOdxbHjy7XXzqId8LvcJ7yb6Nv52+D8wQix7jgtjsnfaI+ZnoELOozEOsMolzjN/uuRjT0MFC0TmOzOuMKB7iwndqXwbwHE9WwXeW8gNtPKM1G1BkSU2xjna/Nx59tVk+A21decUaednHPkGea8gaxHba8R6LXDUefQy2eYrYjDq20yX/a9F39ekFPvOEX+SZeU43z+RiFrkhr0xljbxR8eMRK5xsgTITd/Kes/hXFsr0f9NduHSMz55Lg3kOOPGaKcbcZwHvH4m1WtGy4G3EHXtbUGfWak+uWbGNCmmLL9p0IGeX6HfwuYXDb3id+xTUsbyI51BLZm75Q/77xfcwd27Wev3JjvMrOfepNsrR7m/kPLuSHglCnsXrnZyHO69yT3zrAfpl4sIzl/o4nn3IG/n/nd1ZfE7OSOV/C6LtPkLesNZKzjE9PKMMK+F3emol3/O2ltiJfndpC8fymdahmueHAt/57LlXzzNpH547Wm6t5buUOu5BCzbx/XhKpQaxF9q53qXmcU6La+F5U+k2vrfyVvHdlFd7H/dwZzlmXtN/Rt5VJ/4vZMR1nq1ED3Dedaxn3zf5cs9W48l3AY+O8nevCOvneJ5kH/zH8RnuxQi+M+ldy76rXxnkWJ95gtjdIiuUOaGe/+eLvcznNZuBaYJZPNeK/wMz9dXw'
a
=
base64_decoder
(
b64_test
.
encode
(
'utf-8'
),
'32f'
,
'zlib'
,
'intensity'
)
print
(
a
)
print
(
base64_encoder
(
a
[
1
],
'zlib'
))
print
(
time
.
time
()
-
start
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment