Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
MSCompress
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Yu Gao
MSCompress
Commits
3a7ff6a9
Commit
3a7ff6a9
authored
Sep 25, 2021
by
yugao@uic.edu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated even faster splitter, binary data not completed.
parent
4ebff29b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
74 additions
and
44 deletions
+74
-44
mzml_splitter.py
mzml_splitter.py
+74
-44
No files found.
mzml_splitter.py
View file @
3a7ff6a9
...
...
@@ -3,7 +3,7 @@ import mmap
import
os
import
numpy
as
np
import
time
import
base64
import
py
base64
import
zlib
from
xml.etree
import
cElementTree
as
ET
...
...
@@ -20,29 +20,45 @@ accession_dict = {"MS:1000519": "32i",
"MS:1000514"
:
"mass"
}
def
find_string
(
file_name
,
match_tag_start
,
match_tag_end
):
def
find_string
(
file_name
,
match_tag_start
,
match_tag_end
,
data_format
,
spec_no
):
start_time
=
time
.
time
()
file_size
=
os
.
path
.
getsize
(
file_name
)
encoded_start_tag
=
match_tag_start
.
encode
(
'utf-8'
)
encoded_end_tag
=
match_tag_end
.
encode
(
'utf-8'
)
len_start_tag
=
len
(
match_tag_start
)
len_end_tag
=
len
(
match_tag_end
)
data_positions
=
[]
with
open
(
file_name
,
'r'
,
encoding
=
'utf-8'
)
as
f
:
mass_fmt
=
(
"$
%
s$"
%
'$'
.
join
([
data_format
[
'mass'
][
'data_encoding'
],
data_format
[
'mass'
][
'data_compression'
],
data_format
[
'mass'
][
'data_type'
]]))
.
encode
(
'utf-8'
)
int_fmt
=
(
"$
%
s$"
%
'$'
.
join
([
data_format
[
'intensity'
][
'data_encoding'
],
data_format
[
'intensity'
][
'data_compression'
],
data_format
[
'intensity'
][
'data_type'
]]))
.
encode
(
'utf-8'
)
i
=
0
with
open
(
file_name
,
'r'
,
encoding
=
'utf-8'
)
as
f
,
open
(
file_name
.
replace
(
'.mzML'
,
'.smzml'
),
'wb'
)
as
fo
:
# memory-map the file, size 0 means whole file
m
=
mmap
.
mmap
(
f
.
fileno
(),
0
,
access
=
mmap
.
ACCESS_READ
)
# prot argument is *nix only
last_end
=
0
while
True
:
start
=
m
.
find
(
match_tag_start
.
encode
(
'utf-8'
))
if
start
==-
1
:
if
i
==
spec_no
*
2
:
print
(
"total time used to position all b64 data:"
,
time
.
time
()
-
start_time
)
fo
.
write
(
m
.
read
(
file_size
-
m
.
tell
()))
return
data_positions
end
=
m
.
find
(
match_tag_end
.
encode
(
'utf-8'
))
data_positions
.
append
((
start
+
len
(
match_tag_start
),
end
))
m
.
seek
(
end
+
len
(
match_tag_end
))
start
=
m
.
find
(
encoded_start_tag
)
fo
.
write
(
m
.
read
(
start
+
len_start_tag
-
last_end
))
m
.
seek
(
start
)
end
=
m
.
find
(
encoded_end_tag
)
data_positions
.
append
((
start
+
len_start_tag
,
end
))
m
.
seek
(
end
+
len_end_tag
)
if
i
%
2
==
0
:
fo
.
write
(
mass_fmt
)
else
:
fo
.
write
(
int_fmt
)
fo
.
write
(
b
'</binary>'
)
last_end
=
end
+
len_end_tag
i
+=
1
def
base64_decoder
(
base64_data
:
bytes
,
number_fmt
,
compress_method
,
array_type
):
if
base64_data
is
not
None
:
num_type
=
np_dtype_numtype
[
number_fmt
[
-
1
]]
decode_base64
=
base64
.
decodebytes
(
base64_data
)
decode_base64
=
pybase64
.
_pybase64
.
b64decode_as_bytearray
(
base64_data
)
if
compress_method
==
'zlib'
:
decode_base64
=
zlib
.
decompress
(
decode_base64
)
data
=
np
.
frombuffer
(
decode_base64
,
dtype
=
num_type
)
...
...
@@ -63,13 +79,14 @@ def mzml_splitter(mzml_file: str):
# Open file pointers
mass_binary_out_fp
=
open
(
mass_binary_file
,
'wb'
)
int_binary_out_fp
=
open
(
int_binary_file
,
'wb'
)
mzml_out_fp
=
open
(
mzml_out
,
'w'
,
newline
=
'
\n
'
,
encoding
=
'utf-8'
)
#
mzml_out_fp = open(mzml_out, 'w', newline='\n', encoding='utf-8')
data_format
=
{}
# Iterate mzML file and get all data into data_position[]
current_encoding
=
'32f'
current_compress
=
'no compression'
spec_no
=
0
for
event
,
elem
in
iter
(
ET
.
iterparse
(
mzml_file
,
events
=
(
'start'
,))):
if
elem
.
tag
.
endswith
(
'}cvParam'
):
if
elem
.
get
(
'accession'
)
.
endswith
((
'MS:1000521'
,
'MS:1000522'
,
'MS:1000523'
,
'MS:1000519'
,
'MS:1000520'
)):
# retrieves the datatype based on MS accession
...
...
@@ -81,42 +98,55 @@ def mzml_splitter(mzml_file: str):
data_format
[
current_type
]
=
{
'data_encoding'
:
current_encoding
,
'data_compression'
:
current_compress
,
'data_type'
:
current_type
}
elif
elem
.
tag
.
endswith
(
'}spectrumList'
):
spec_no
=
elem
.
get
(
'count'
)
print
(
spec_no
*
2
)
spec_no
=
int
(
elem
.
get
(
'count'
)
)
print
(
"Total binary data:"
,
spec_no
*
2
)
elif
len
(
data_format
.
keys
())
==
2
:
print
(
data_format
)
print
(
"Mass and intensity data format"
,
data_format
)
break
data_position
=
find_string
(
file_name
,
'<binary>'
,
'</binary>'
)
with
open
(
mzml_file
,
'r'
,
encoding
=
'utf-8'
)
as
mzml_in_fp
:
m
=
mmap
.
mmap
(
mzml_in_fp
.
fileno
(),
0
,
access
=
mmap
.
ACCESS_READ
)
i
=
0
for
line
in
mzml_in_fp
:
if
line
.
lstrip
()
.
startswith
(
'<binary>'
):
mzml_out_fp
.
write
(
line
.
split
(
'<binary>'
)[
0
])
if
i
%
2
==
0
:
fmt
=
data_format
[
'mass'
]
else
:
fmt
=
data_format
[
'intensity'
]
m
.
seek
(
data_position
[
i
][
0
])
b64_content_byte
=
m
.
read
(
data_position
[
i
][
1
]
-
data_position
[
i
][
0
])
number_array
=
base64_decoder
(
b64_content_byte
,
fmt
[
'data_encoding'
],
fmt
[
'data_compression'
],
fmt
[
'data_type'
])
mzml_out_fp
.
write
(
"<binary>$
%
s$</binary>
\n
"
%
'$'
.
join
([
str
(
len
(
number_array
)),
fmt
[
'data_encoding'
],
fmt
[
'data_compression'
],
fmt
[
'data_type'
]]))
number_fmt
=
np_dtype_numtype
[
fmt
[
'data_encoding'
][
-
1
]]
if
fmt
[
'data_type'
]
==
'mass'
:
mass_binary_out_fp
.
write
(
number_array
.
astype
(
number_fmt
))
else
:
int_binary_out_fp
.
write
(
number_array
.
astype
(
number_fmt
))
i
+=
1
else
:
mzml_out_fp
.
write
(
line
)
data_position
=
find_string
(
mzml_file
,
'<binary>'
,
'</binary>'
,
data_format
,
spec_no
)
# with open(mzml_file, 'r', encoding='utf-8') as mzml_in_fp:
# #m = mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ)
# i=0
# for line in mzml_in_fp:
# if line.lstrip().startswith('<binary>') and i< spec_no*2:
# mzml_out_fp.write(line.split('<binary>')[0])
#
# if i%2==0:
# fmt = data_format['mass']
# else:
# fmt = data_format['intensity']
#
# # m.seek(data_position[i][0])
# # b64_content_byte = m.read(data_position[i][1] - data_position[i][0])
# # number_array = base64_decoder(b64_content_byte, fmt['data_encoding'], fmt['data_compression'], fmt['data_type'])
# # number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
# # if fmt['data_type'] == 'mass':
# # mass_binary_out_fp.write(number_array.astype(number_fmt))
# # else:
# # int_binary_out_fp.write(number_array.astype(number_fmt))
#
# mzml_out_fp.write("<binary>$%s$</binary>\n" % '$'.join([fmt['data_encoding'], fmt['data_compression'], fmt['data_type']]))
# i+=1
# else:
# mzml_out_fp.write(line)
# m = mmap.mmap(mzml_in_fp.fileno(), 0, access=mmap.ACCESS_READ)
# m.seek(data_position[i][0])
# b64_content_byte = m.read(data_position[i][1] - data_position[i][0])
# number_array = base64_decoder(b64_content_byte, fmt['data_encoding'], fmt['data_compression'], fmt['data_type'])
# number_fmt = np_dtype_numtype[fmt['data_encoding'][-1]]
# if fmt['data_type'] == 'mass':
# mass_binary_out_fp.write(number_array.astype(number_fmt))
# else:
# int_binary_out_fp.write(number_array.astype(number_fmt))
# Close file pointers
mzml_out_fp
.
close
()
#
mzml_out_fp.close()
int_binary_out_fp
.
close
()
mass_binary_out_fp
.
close
()
...
...
@@ -124,7 +154,7 @@ def mzml_splitter(mzml_file: str):
if
__name__
==
'__main__'
:
file_name
=
r'K:\test\test\fusion_20200116_qjl_YLD_19.mzML'
start
=
time
.
time
()
print
(
os
.
path
.
getsize
(
file_name
)
/
1e9
,
"GB"
)
print
(
mzml_splitter
(
file_name
)
)
print
(
"Total file size"
,
os
.
path
.
getsize
(
file_name
)
/
1e9
,
"GB"
)
mzml_splitter
(
file_name
)
print
(
time
.
time
()
-
start
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment