Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
MSCompress
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Yu Gao
MSCompress
Commits
4ebff29b
Commit
4ebff29b
authored
Sep 24, 2021
by
yugao@uic.edu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated faster splitter
parent
97e8cad5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
143 additions
and
4 deletions
+143
-4
mzml_reader.py
mzml_reader.py
+12
-3
mzml_splitter.py
mzml_splitter.py
+130
-0
tester.py
tester.py
+1
-1
No files found.
mzml_reader.py
View file @
4ebff29b
...
...
@@ -4,6 +4,7 @@ import zlib
import
base64
import
numpy
as
np
import
time
import
os
# Lookup from a one-letter number-format code to the NumPy scalar type used
# for that code when (de)serialising arrays.
np_dtype_numtype = {
    'i': np.int32,
    'e': np.single,   # np.single is NumPy's name for 32-bit float
    'f': np.float32,
    'q': np.int64,
    'd': np.float64,
}
...
...
@@ -76,8 +77,8 @@ def mzml_splitter(mzml_file: str):
mzml_out_fp
=
open
(
mzml_out
,
'w'
,
newline
=
'
\n
'
,
encoding
=
'utf-8'
)
# Create empty list for data temporary storage
data_
position
=
[]
data_
format
=
{}
data_position
=
[]
# Iterate mzML file and get all data into data_position[]
for
event
,
elem
in
iter
(
ET
.
iterparse
(
mzml_file
,
events
=
(
'end'
,))):
if
elem
.
tag
.
endswith
(
'}cvParam'
):
...
...
@@ -89,10 +90,11 @@ def mzml_splitter(mzml_file: str):
data_type_dict
[
'data_type'
]
=
accession_dict
[
elem
.
get
(
'accession'
)]
elif
elem
.
tag
.
endswith
(
'}binary'
):
number_array
=
base64_decoder
(
elem
.
text
,
data_type_dict
[
'data_encoding'
],
data_type_dict
[
'data_compression'
],
data_type_dict
[
'data_type'
])
number_array
=
base64_
lossy_
decoder
(
elem
.
text
,
data_type_dict
[
'data_encoding'
],
data_type_dict
[
'data_compression'
],
data_type_dict
[
'data_type'
])
data_type_dict
[
'data_number'
]
=
str
(
len
(
number_array
))
data_position
.
append
(([
data_type_dict
[
'data_number'
],
data_type_dict
[
'data_encoding'
],
data_type_dict
[
'data_compression'
],
data_type_dict
[
'data_type'
]],
number_array
))
# Split mzml into smzml, binary mass and binary int
with
open
(
mzml_file
,
'r'
,
encoding
=
'utf-8'
)
as
mzml_in_fp
:
i
=
0
...
...
@@ -193,3 +195,10 @@ def mzml_decoder(smzml_file, bmass_file, bint_file, mzml_file):
mass_in_fp
.
close
()
int_in_fp
.
close
()
smzml_file
.
close
()
# Script entry point: report the size of a sample mzML file and the time
# taken to do so.
if __name__ == '__main__':
    file_name = r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
    start = time.time()
    size_in_gb = os.path.getsize(file_name) / 1e9
    print(size_in_gb, "GB")
    elapsed_seconds = time.time() - start
    print(elapsed_seconds)
\ No newline at end of file
mzml_splitter.py
0 → 100644
View file @
4ebff29b
# mscompress-pub.mzml_splitter created by bathy at 9/24/2021
import
mmap
import
os
import
numpy
as
np
import
time
import
base64
import
zlib
from
xml.etree
import
cElementTree
as
ET
# Lookup from the trailing letter of a number-format code (e.g. the 'f' in
# "32f") to the NumPy scalar type used when decoding and writing arrays.
# NOTE(review): 'e' is used by the "16e" format code but maps to np.single,
# which is a 32-bit float; confirm whether np.float16 was intended.
np_dtype_numtype = {
    'i': np.int32,
    'e': np.single,
    'f': np.float32,
    'q': np.int64,
    'd': np.float64,
}
# Translation of the cvParam accession numbers this module cares about into
# the short codes used throughout the splitter.
accession_dict = {
    # number formats: "<bit width><type letter>"
    "MS:1000519": "32i",
    "MS:1000520": "16e",
    "MS:1000521": "32f",
    "MS:1000522": "64i",
    "MS:1000523": "64d",
    # compression schemes
    "MS:1000574": "zlib",
    "MS:1000576": "no compression",
    # array kinds
    "MS:1000515": "intensity",
    "MS:1000514": "mass",
}
def find_string(file_name, match_tag_start, match_tag_end):
    """Locate the payload between every start/end tag pair in a file.

    The file is memory-mapped and scanned as raw bytes, so the returned
    offsets are byte offsets into the file, not character indices.

    Args:
        file_name: Path of the file to scan.
        match_tag_start: Opening tag text, e.g. ``'<binary>'``.
        match_tag_end: Closing tag text, e.g. ``'</binary>'``.

    Returns:
        A list of ``(payload_start, payload_end)`` byte-offset tuples, one
        per tag pair in file order. ``payload_start`` is the first byte after
        the opening tag and ``payload_end`` is the first byte of the closing
        tag.

    Raises:
        ValueError: If either tag is empty, or an opening tag is not followed
            by a closing tag. ``mmap`` itself also raises ``ValueError`` for
            an empty file.
    """
    start_time = time.time()

    # Encode once: the search runs on bytes, and the payload offset must be
    # advanced by the *encoded* tag length.
    start_tag = match_tag_start.encode('utf-8')
    end_tag = match_tag_end.encode('utf-8')
    if not start_tag or not end_tag:
        # An empty tag matches at every position and would never advance.
        raise ValueError("match_tag_start and match_tag_end must be non-empty")

    data_positions = []
    with open(file_name, 'rb') as f:
        # Length 0 maps the whole file; the context manager releases the map.
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            cursor = 0
            while True:
                start = m.find(start_tag, cursor)
                if start == -1:
                    break
                payload_start = start + len(start_tag)
                # Search for the closing tag only after this opening tag.
                end = m.find(end_tag, payload_start)
                if end == -1:
                    raise ValueError(
                        "no %r found after %r at byte offset %d in %s"
                        % (match_tag_end, match_tag_start, start, file_name)
                    )
                data_positions.append((payload_start, end))
                cursor = end + len(end_tag)

    print("total time used to position all b64 data:", time.time() - start_time)
    return data_positions
# Exact NumPy dtype for each number-format code. mzML binary arrays are
# little-endian, hence the explicit '<'. Keyed on the full code because the
# trailing letter alone cannot tell "32i" from "64i".
_WIRE_DTYPES = {
    '32i': '<i4',
    '16e': '<f2',
    '32f': '<f4',
    '64i': '<i8',
    '64d': '<f8',
}


def base64_decoder(base64_data: bytes, number_fmt, compress_method, array_type):
    """Decode one base64 payload into a NumPy array.

    Args:
        base64_data: Base64 text of the array as ``bytes`` (``str`` is also
            accepted), or ``None`` for an absent payload.
        number_fmt: Number-format code such as ``"32f"`` or ``"64i"``.
        compress_method: ``'zlib'`` to inflate the decoded bytes; any other
            value leaves them untouched.
        array_type: Kind of array (``'mass'`` / ``'intensity'``). Currently
            unused; kept so existing callers keep working.

    Returns:
        A read-only 1-D array viewing the decoded bytes, or an empty array
        when ``base64_data`` is ``None``.

    Raises:
        KeyError: If ``number_fmt`` is neither a known format code nor ends
            in a letter present in ``np_dtype_numtype``.
    """
    if base64_data is None:
        return np.array([])

    if isinstance(base64_data, str):
        # base64.decodebytes only accepts bytes-like input.
        base64_data = base64_data.encode('ascii')

    wire_dtype = _WIRE_DTYPES.get(number_fmt)
    if wire_dtype is not None:
        num_type = np.dtype(wire_dtype)
    else:
        # Unknown code: fall back to the letter-only lookup.
        num_type = np_dtype_numtype[number_fmt[-1]]

    decode_base64 = base64.decodebytes(base64_data)
    if compress_method == 'zlib':
        decode_base64 = zlib.decompress(decode_base64)
    return np.frombuffer(decode_base64, dtype=num_type)
def _detect_array_formats(mzml_file):
    """Read cvParams from the head of an mzML file to learn the array formats.

    Returns a dict keyed by array kind (``'mass'`` / ``'intensity'``) whose
    values hold ``data_encoding``, ``data_compression`` and ``data_type``.
    """
    encoding_accessions = ('MS:1000521', 'MS:1000522', 'MS:1000523',
                           'MS:1000519', 'MS:1000520')
    compression_accessions = ('MS:1000576', 'MS:1000574')
    array_type_accessions = ('MS:1000515', 'MS:1000514')

    data_format = {}
    current_encoding = '32f'
    current_compress = 'no compression'
    # Open the file ourselves so it is closed even though we stop early.
    with open(mzml_file, 'rb') as source:
        for _event, elem in ET.iterparse(source, events=('start',)):
            if elem.tag.endswith('}cvParam'):
                accession = elem.get('accession')
                if accession in encoding_accessions:
                    # retrieves the datatype based on MS accession
                    current_encoding = accession_dict[accession]
                elif accession in compression_accessions:
                    # retrieves the compression based on MS accession
                    current_compress = accession_dict[accession]
                elif accession in array_type_accessions:
                    # retrieves array_type
                    # NOTE(review): the format is captured here, so the
                    # encoding/compression cvParams are assumed to appear
                    # before the array-type cvParam - confirm.
                    current_type = accession_dict[accession]
                    data_format[current_type] = {
                        'data_encoding': current_encoding,
                        'data_compression': current_compress,
                        'data_type': current_type,
                    }
            elif elem.tag.endswith('}spectrumList'):
                count = elem.get('count')
                if count is not None:
                    # The attribute is text; convert before doubling.
                    print(int(count) * 2)
            elif len(data_format) == 2:
                # Both array kinds are known; no need to parse further.
                print(data_format)
                break
    return data_format


def mzml_splitter(mzml_file: str):
    """Split an mzML file into a slim XML file plus two raw binary files.

    Every line that starts with ``<binary>`` is replaced in the ``.smzml``
    output by ``<binary>$count$encoding$compression$type$</binary>``, and the
    decoded numbers are appended to the ``.bmass`` (mass arrays) or ``.bint``
    (intensity arrays) file. All other lines are copied unchanged.

    The code assumes that ``<binary>`` elements alternate mass, intensity,
    mass, ... and that each one starts its own line.

    Args:
        mzml_file: Path of the input mzML file. The three outputs are written
            next to it, with the extension replaced.

    Returns:
        None.

    Raises:
        KeyError: If a ``<binary>`` line is met but the corresponding array
            format was not found in the file's cvParams.
        IndexError: If more ``<binary>`` lines are met than payloads located.
    """
    # Gather everything that can fail before any output file is created.
    data_format = _detect_array_formats(mzml_file)
    data_position = find_string(mzml_file, '<binary>', '</binary>')

    # Generate file names. splitext (not str.replace) so a path without the
    # exact '.mzML' suffix can never resolve to the input file itself.
    base_name = os.path.splitext(mzml_file)[0]
    int_binary_file = base_name + '.bint'
    mass_binary_file = base_name + '.bmass'
    mzml_out = base_name + '.smzml'

    with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp, \
            open(mzml_out, 'w', newline='\n', encoding='utf-8') as mzml_out_fp, \
            open(mass_binary_file, 'wb') as mass_binary_out_fp, \
            open(int_binary_file, 'wb') as int_binary_out_fp, \
            mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ) as m:
        i = 0
        for line in mzml_in_fp:
            if not line.lstrip().startswith('<binary>'):
                mzml_out_fp.write(line)
                continue

            # Keep the original indentation in front of the element.
            mzml_out_fp.write(line.split('<binary>')[0])

            # Even-numbered payloads are mass arrays, odd ones intensity.
            fmt = data_format['mass'] if i % 2 == 0 else data_format['intensity']

            payload_start, payload_end = data_position[i]
            b64_content_byte = m[payload_start:payload_end]
            number_array = base64_decoder(b64_content_byte,
                                          fmt['data_encoding'],
                                          fmt['data_compression'],
                                          fmt['data_type'])
            mzml_out_fp.write("<binary>$%s$</binary>\n" % '$'.join([
                str(len(number_array)),
                fmt['data_encoding'],
                fmt['data_compression'],
                fmt['data_type'],
            ]))

            number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
            raw_numbers = number_array.astype(number_fmt).tobytes()
            if fmt['data_type'] == 'mass':
                mass_binary_out_fp.write(raw_numbers)
            else:
                int_binary_out_fp.write(raw_numbers)
            i += 1
# Script entry point: split one mzML file and report its size and the run time.
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the sample path.
    if len(sys.argv) > 1:
        file_name = sys.argv[1]
    else:
        file_name = r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
    start = time.time()
    print(os.path.getsize(file_name) / 1e9, "GB")
    # mzml_splitter has no return value, so it is called and not printed.
    mzml_splitter(file_name)
    print(time.time() - start)
tester.py
View file @
4ebff29b
# mscompress-pub.tester created by bathy at 9/23/2021
import
mzml_reader
as
mzr
mzml_file
=
r'K:\test\
32bit_uncompressed_centroid
\fusion_20200116_qjl_YLD_19.mzML'
mzml_file
=
r'K:\test\
64bit_zlib_profile
\fusion_20200116_qjl_YLD_19.mzML'
smzml_file
=
r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.smzml'
bmass_file
=
r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bmass'
bint_file
=
r'J:\mass_cloud\projects_folder\1000111\test\fusion_20200116_qjl_YLD_19.bint'
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment