Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
MSCompress
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Yu Gao
MSCompress
Commits
b66f7dd4
Commit
b66f7dd4
authored
Sep 24, 2021
by
yugao@uic.edu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
tidy version of mzml_reader with splitter and decoder
parents
Pipeline
#238
failed with stages
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
133 additions
and
0 deletions
+133
-0
mzml_reader.py
mzml_reader.py
+121
-0
tester.py
tester.py
+12
-0
No files found.
mzml_reader.py
0 → 100644
View file @
b66f7dd4
import
struct
from
xml.etree
import
cElementTree
as
ET
import
zlib
import
base64
import
numpy
as
np
np_dtype_numtype
=
{
'i'
:
np
.
int32
,
'e'
:
np
.
single
,
'f'
:
np
.
float32
,
'q'
:
np
.
int64
,
'd'
:
np
.
float64
}
accession_dict
=
{
"MS:1000519"
:
"32i"
,
"MS:1000520"
:
"16e"
,
"MS:1000521"
:
"32f"
,
"MS:1000522"
:
"64i"
,
"MS:1000523"
:
"64d"
,
"MS:1000574"
:
"zlib"
,
"MS:1000576"
:
"no compression"
,
"MS:1000515"
:
"intensity"
,
"MS:1000514"
:
"mass"
}
data_type_dict
=
{
'data_number'
:
'0'
,
'data_encoding'
:
'32f'
,
'data_compression'
:
'no compression'
,
'data_type'
:
'mass'
}
def
base64_decoder
(
base64_data
,
number_fmt
,
compress_method
,
array_type
):
if
base64_data
is
not
None
:
num_type
=
np_dtype_numtype
[
number_fmt
[
-
1
]]
decode_base64
=
base64
.
decodebytes
(
base64_data
.
encode
(
'ascii'
))
if
compress_method
==
'zlib'
:
decode_base64
=
zlib
.
decompress
(
decode_base64
)
data
=
np
.
frombuffer
(
decode_base64
,
dtype
=
num_type
)
#if array_type == 'intensity':
# data = np.log2(np.where(data>0.00001, data, 0.00001)/np.linalg.norm(data)) # performs log only on intensity
else
:
data
=
np
.
array
([])
return
data
def
base64_encoder
(
number_array
:
np
.
ndarray
,
compress_method
:
str
):
byte_data
=
number_array
.
tobytes
()
if
compress_method
==
'zlib'
:
byte_data
=
zlib
.
compress
(
byte_data
)
return
base64
.
b64encode
(
byte_data
)
.
decode
(
'ascii'
)
.
replace
(
'
\n
'
,
''
)
def
mzml_splitter
(
mzml_file
:
str
):
# Generate file names
int_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bint'
)
mass_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bmass'
)
mzml_out
=
mzml_file
.
replace
(
'.mzML'
,
'.smzml'
)
# Open file pointers
mass_binary_out_fp
=
open
(
mass_binary_file
,
'wb'
)
int_binary_out_fp
=
open
(
int_binary_file
,
'wb'
)
mzml_out_fp
=
open
(
mzml_out
,
'w'
,
newline
=
'
\n
'
)
# Create empty list for data temporary storage
data_position
=
[]
# Iterate mzML file and get all data into data_position[]
for
event
,
elem
in
iter
(
ET
.
iterparse
(
mzml_file
,
events
=
(
'end'
,))):
if
elem
.
tag
.
endswith
(
'}cvParam'
):
if
elem
.
get
(
'accession'
)
.
endswith
((
'MS:1000521'
,
'MS:1000522'
,
'MS:1000523'
,
'MS:1000519'
,
'MS:1000520'
)):
# retrieves the datatype based on MS accession
data_type_dict
[
'data_encoding'
]
=
accession_dict
[
elem
.
get
(
'accession'
)]
if
elem
.
get
(
'accession'
)
.
endswith
((
'MS:1000576'
,
'MS:1000574'
)):
# retrieves the compression based on MS accession
data_type_dict
[
'data_compression'
]
=
accession_dict
[
elem
.
get
(
'accession'
)]
if
elem
.
get
(
'accession'
)
.
endswith
((
'MS:1000515'
,
'MS:1000514'
)):
# retrives array_type
data_type_dict
[
'data_type'
]
=
accession_dict
[
elem
.
get
(
'accession'
)]
elif
elem
.
tag
.
endswith
(
'}binary'
):
number_array
=
base64_decoder
(
elem
.
text
,
data_type_dict
[
'data_encoding'
],
data_type_dict
[
'data_compression'
],
data_type_dict
[
'data_type'
])
data_type_dict
[
'data_number'
]
=
str
(
len
(
number_array
))
data_position
.
append
(([
data_type_dict
[
'data_number'
],
data_type_dict
[
'data_encoding'
],
data_type_dict
[
'data_compression'
],
data_type_dict
[
'data_type'
]],
number_array
))
# Split mzml into smzml, binary mass and binary int
with
open
(
mzml_file
,
'r'
)
as
mzml_in_fp
:
i
=
0
for
line
in
mzml_in_fp
:
if
'<binary>'
in
line
and
'</binary>'
in
line
:
mzml_out_fp
.
write
(
line
.
split
(
'<binary>'
)[
0
])
mzml_out_fp
.
write
(
"<binary>$
%
s$</binary>
\n
"
%
'$'
.
join
(
data_position
[
i
][
0
]))
number_array
=
data_position
[
i
][
1
]
number_fmt
=
np_dtype_numtype
[
data_position
[
i
][
0
][
1
][
-
1
]]
if
data_position
[
i
][
0
][
3
]
==
'mass'
:
mass_binary_out_fp
.
write
(
number_array
.
astype
(
number_fmt
))
else
:
int_binary_out_fp
.
write
(
number_array
.
astype
(
number_fmt
))
i
+=
1
else
:
mzml_out_fp
.
write
(
line
)
# Close file pointers
mzml_out_fp
.
close
()
int_binary_out_fp
.
close
()
mass_binary_out_fp
.
close
()
def
mzml_decoder
(
smzml_file
,
bmass_file
,
bint_file
,
mzml_file
):
# Create file pointers
mass_in_fp
=
open
(
bmass_file
,
'rb'
)
int_in_fp
=
open
(
bint_file
,
'rb'
)
smzml_file
=
open
(
smzml_file
,
'r'
)
# Restore mzML file from smzml
with
open
(
mzml_file
,
'w'
,
newline
=
'
\n
'
)
as
f_out
:
for
line
in
smzml_file
:
if
'<binary>'
in
line
and
'</binary>'
in
line
:
f_out
.
write
(
line
.
split
(
'<binary>'
)[
0
])
_
,
data_num
,
data_fmt
,
data_compression
,
data_type
,
_
=
line
.
split
(
'$'
)
if
data_type
==
'mass'
:
number_array
=
np
.
frombuffer
(
mass_in_fp
.
read
(
int
(
int
(
data_fmt
[:
2
])
*
int
(
data_num
)
/
8
)),
np_dtype_numtype
[
data_fmt
[
-
1
]])
else
:
number_array
=
np
.
frombuffer
(
int_in_fp
.
read
(
int
(
int
(
data_fmt
[:
2
])
*
int
(
data_num
)
/
8
)),
np_dtype_numtype
[
data_fmt
[
-
1
]])
f_out
.
write
(
'<binary>
%
s</binary>
\n
'
%
base64_encoder
(
number_array
,
data_compression
))
else
:
f_out
.
write
(
line
)
# Close file pointers
mass_in_fp
.
close
()
int_in_fp
.
close
()
smzml_file
.
close
()
tester.py
0 → 100644
View file @
b66f7dd4
# mscompress-pub.tester created by bathy at 9/23/2021
import
mzml_reader
as
mzr
mzml_file
=
r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.mzML'
smzml_file
=
r'J:\mass_cloud\projects_folder\1000111\test\test2.smzml'
bmass_file
=
r'J:\mass_cloud\projects_folder\1000111\test\test2.bmass'
bint_file
=
r'J:\mass_cloud\projects_folder\1000111\test\test2.bint'
mzml_out
=
r'J:\mass_cloud\projects_folder\1000111\test\test2.mzML'
mzr
.
mzml_splitter
(
mzml_file
)
mzr
.
mzml_decoder
(
smzml_file
,
bmass_file
,
bint_file
,
mzml_out
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment