Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
MSCompress
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Yu Gao
MSCompress
Commits
97e8cad5
Commit
97e8cad5
authored
Sep 24, 2021
by
yugao@uic.edu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated encoding
parent
b66f7dd4
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
86 additions
and
7 deletions
+86
-7
mzml_reader.py
mzml_reader.py
+76
-2
tester.py
tester.py
+10
-5
No files found.
mzml_reader.py
View file @
97e8cad5
...
...
@@ -3,6 +3,7 @@ from xml.etree import cElementTree as ET
import
zlib
import
base64
import
numpy
as
np
import
time
# Lookup from the trailing character of a number-format string to the NumPy
# scalar type used when reinterpreting decoded bytes as an array.
# NOTE(review): in the stdlib `struct` notation 'e' denotes a 16-bit float,
# but it is mapped to np.single (an alias of np.float32) here -- confirm
# that this is intended.
np_dtype_numtype = {
    'i': np.int32,
    'e': np.single,
    'f': np.float32,
    'q': np.int64,
    'd': np.float64,
}
...
...
@@ -36,6 +37,25 @@ def base64_decoder(base64_data, number_fmt, compress_method, array_type):
return
data
def base64_lossy_decoder(base64_data, number_fmt, compress_method, array_type):
    """Decode one base64 payload and quantise it to a compact integer array.

    Args:
        base64_data: Base64 text of the binary array, or ``None`` when the
            element carried no text.
        number_fmt: Number-format string; only its last character is used,
            as a key into ``np_dtype_numtype``.
        compress_method: ``'zlib'`` to inflate the decoded bytes first; any
            other value leaves them untouched.
        array_type: ``'intensity'`` selects the log-scale encoding; every
            other value is treated as a mass array and delta-encoded.

    Returns:
        ``np.ndarray``:
          * intensity -- ``np.ushort`` values equal to
            ``round(log2(x), 4) * 1000`` where ``x`` is the input scaled to
            an L2 norm of 65535 and floored at ``0.00001``;
          * mass -- ``np.int32`` array whose first element is
            ``int(first * 10000)`` followed by ``diff * 10000`` truncated to
            integers;
          * an empty array when there is nothing to decode.
    """
    if base64_data is None:
        return np.array([])

    num_type = np_dtype_numtype[number_fmt[-1]]
    raw = base64.decodebytes(base64_data.encode('ascii'))
    # An empty payload is not a valid zlib stream; treat it as "no data"
    # instead of letting zlib.decompress() raise.
    if compress_method == 'zlib' and raw:
        raw = zlib.decompress(raw)
    data = np.frombuffer(raw, dtype=num_type)

    if array_type == 'intensity':
        # An all-zero (or empty) array has a zero norm, so the division
        # yields NaN.  Those NaNs fail the `> 0.00001` test below and are
        # replaced by the floor value, so only the warning is suppressed;
        # the result is unchanged.
        with np.errstate(divide='ignore', invalid='ignore'):
            data = data / np.linalg.norm(data) * 65535
        # performs log only on intensity
        # NOTE(review): scaled values below 1 have a negative log2, which is
        # then cast to an unsigned type -- confirm the decoder expects that.
        floored = np.where(data > 0.00001, data, 0.00001)
        return (np.round(np.log2(floored), 4) * 1000).astype(np.ushort)

    # Mass array: keep the first value and store the rest as differences.
    if data.size == 0:
        # Nothing to delta-encode; data[0] would raise IndexError.
        return np.array([], dtype=np.int32)
    basemass = int(data[0] * 10000)
    deltas = (np.diff(data) * 10000).astype(np.int32)
    return np.insert(deltas, 0, basemass)
def
base64_encoder
(
number_array
:
np
.
ndarray
,
compress_method
:
str
):
byte_data
=
number_array
.
tobytes
()
if
compress_method
==
'zlib'
:
...
...
@@ -44,6 +64,7 @@ def base64_encoder(number_array: np.ndarray, compress_method: str):
def
mzml_splitter
(
mzml_file
:
str
):
start
=
time
.
time
()
# Generate file names
int_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bint'
)
mass_binary_file
=
mzml_file
.
replace
(
'.mzML'
,
'.bmass'
)
...
...
@@ -52,7 +73,7 @@ def mzml_splitter(mzml_file: str):
# Open file pointers
mass_binary_out_fp
=
open
(
mass_binary_file
,
'wb'
)
int_binary_out_fp
=
open
(
int_binary_file
,
'wb'
)
mzml_out_fp
=
open
(
mzml_out
,
'w'
,
newline
=
'
\n
'
)
mzml_out_fp
=
open
(
mzml_out
,
'w'
,
newline
=
'
\n
'
,
encoding
=
'utf-8'
)
# Create empty list for data temporary storage
data_position
=
[]
...
...
@@ -73,7 +94,7 @@ def mzml_splitter(mzml_file: str):
data_position
.
append
(([
data_type_dict
[
'data_number'
],
data_type_dict
[
'data_encoding'
],
data_type_dict
[
'data_compression'
],
data_type_dict
[
'data_type'
]],
number_array
))
# Split mzml into smzml, binary mass and binary int
with
open
(
mzml_file
,
'r'
)
as
mzml_in_fp
:
with
open
(
mzml_file
,
'r'
,
encoding
=
'utf-8'
)
as
mzml_in_fp
:
i
=
0
for
line
in
mzml_in_fp
:
if
'<binary>'
in
line
and
'</binary>'
in
line
:
...
...
@@ -94,6 +115,59 @@ def mzml_splitter(mzml_file: str):
int_binary_out_fp
.
close
()
mass_binary_out_fp
.
close
()
print
(
time
.
time
()
-
start
)
def mzml_lossy_splitter(mzml_file: str):
    """Split an mzML file into a stripped XML file plus two lossy binaries.

    Three files are written, their names derived by replacing ``.mzML`` in
    ``mzml_file``:
      * ``.smzml`` -- the input text with every single-line
        ``<binary>...</binary>`` payload replaced by a
        ``$count$encoding$compression$type_lossy$`` placeholder;
      * ``.bmass`` -- all mass arrays, written as ``np.int32``;
      * ``.bint``  -- all other arrays, written as ``np.ushort``.

    The module-level ``data_type_dict`` is updated while parsing and
    ``accession_dict`` is used to translate cvParam accessions.

    Args:
        mzml_file: Path of the input file; must contain ``.mzML``.

    Raises:
        ValueError: If ``mzml_file`` does not contain ``.mzML``.  Without
            this check the derived output names would equal the input path
            and opening them for writing would truncate the input.
    """
    # Generate file names
    int_binary_file = mzml_file.replace('.mzML', '.bint')
    mass_binary_file = mzml_file.replace('.mzML', '.bmass')
    mzml_out = mzml_file.replace('.mzML', '.smzml')
    if mzml_out == mzml_file:
        raise ValueError(
            "mzml_file must contain '.mzML' so output names can be derived: %r"
            % (mzml_file,))

    # Create empty list for data temporary storage
    data_position = []

    # Iterate mzML file and get all data into data_position[].  This runs
    # before any output file is opened, so a parse error leaves no
    # truncated outputs behind.
    for _event, elem in ET.iterparse(mzml_file, events=('end',)):
        if elem.tag.endswith('}cvParam'):
            # Default to '' so a cvParam without an accession is skipped
            # rather than raising AttributeError on None.
            accession = elem.get('accession', '')
            if accession.endswith(('MS:1000521', 'MS:1000522', 'MS:1000523',
                                   'MS:1000519', 'MS:1000520')):
                # retrieves the datatype based on MS accession
                data_type_dict['data_encoding'] = accession_dict[accession]
            if accession.endswith(('MS:1000576', 'MS:1000574')):
                # retrieves the compression based on MS accession
                data_type_dict['data_compression'] = accession_dict[accession]
            if accession.endswith(('MS:1000515', 'MS:1000514')):
                # retrieves array_type
                data_type_dict['data_type'] = accession_dict[accession]
        elif elem.tag.endswith('}binary'):
            number_array = base64_lossy_decoder(
                elem.text,
                data_type_dict['data_encoding'],
                data_type_dict['data_compression'],
                data_type_dict['data_type'])
            data_type_dict['data_number'] = str(len(number_array))
            data_position.append((
                [data_type_dict['data_number'],
                 data_type_dict['data_encoding'],
                 data_type_dict['data_compression'],
                 data_type_dict['data_type']],
                number_array))

    # Split mzml into smzml, binary mass and binary int.  The `with`
    # statement closes every handle even if an error occurs mid-way, and
    # the text files use UTF-8 explicitly, matching mzml_splitter.
    with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp, \
            open(mzml_out, 'w', newline='\n', encoding='utf-8') as mzml_out_fp, \
            open(mass_binary_file, 'wb') as mass_binary_out_fp, \
            open(int_binary_file, 'wb') as int_binary_out_fp:
        i = 0  # index of the next parsed array to emit
        for line in mzml_in_fp:
            if '<binary>' in line and '</binary>' in line:
                header, number_array = data_position[i]
                # Keep whatever precedes the tag (the line's indentation).
                mzml_out_fp.write(line.split('<binary>')[0])
                mzml_out_fp.write(
                    "<binary>$%s_lossy$</binary>\n" % '$'.join(header))
                if header[3] == 'mass':
                    mass_binary_out_fp.write(number_array.astype(np.int32))
                else:
                    int_binary_out_fp.write(number_array.astype(np.ushort))
                i += 1
            else:
                mzml_out_fp.write(line)
def
mzml_decoder
(
smzml_file
,
bmass_file
,
bint_file
,
mzml_file
):
# Create file pointers
...
...
tester.py
View file @
97e8cad5
# mscompress-pub.tester created by bathy at 9/23/2021
# Manual driver script: splits one mzML file with mzml_reader and then
# rebuilds an mzML file from the split parts.  All paths are hard-coded.
import mzml_reader as mzr

# NOTE(review): the next four names are each assigned twice; only the second
# group of assignments is in effect when the calls below run.
mzml_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.mzML'
smzml_file = r'J:\mass_cloud\projects_folder\1000111\test\test2.smzml'
bmass_file = r'J:\mass_cloud\projects_folder\1000111\test\test2.bmass'
bint_file = r'J:\mass_cloud\projects_folder\1000111\test\test2.bint'
# NOTE(review): the input lives under K:\ while the split files are read
# from J:\ -- presumably the splitter writes its outputs next to the input,
# so these paths may not point at the files it produces; verify.
mzml_file = r'K:\test\32bit_uncompressed_centroid\fusion_20200116_qjl_YLD_19.mzML'
smzml_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.smzml'
bmass_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bmass'
bint_file = r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bint'
# Destination of the re-generated mzML file.
mzml_out = r'J:\mass_cloud\projects_folder\1000111\test\test2.mzML'
# Split to lossless files
mzr.mzml_splitter(mzml_file)
mzr.mzml_decoder(smzml_file, bmass_file, bint_file, mzml_out)
# Split to lossy files
#mzr.mzml_lossy_splitter(mzml_file)
# Re-generate mzML file
#mzr.mzml_decoder(smzml_file, bmass_file, bint_file, mzml_out)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment