changed
CHANGELOG.md
|
@@ -1,5 +1,11 @@
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+ ## v1.6.0 - 2024-10-22
|
4
|
+
|
5
|
+ * Require Elixir 1.12+.
|
6
|
+ * Introduce `Saxy.stream_events` (#118)
|
7
|
+ * Fix several bugs and compilation warnings
|
8
|
+
|
3
9
|
## v1.5.0 - 2022-11-09
|
4
10
|
|
5
11
|
* Saxy.encode! now supports "UTF-8"/"utf-8" as string in the prolog [#102](https://github.com/qcam/saxy/pull/102).
|
changed
README.md
|
@@ -23,7 +23,7 @@ Add `:saxy` to your `mix.exs`.
|
23
23
|
```elixir
|
24
24
|
def deps() do
|
25
25
|
[
|
26
|
- {:saxy, "~> 1.5"}
|
26
|
+ {:saxy, "~> 1.6"}
|
27
27
|
]
|
28
28
|
end
|
29
29
|
```
|
changed
hex_metadata.config
|
@@ -2,11 +2,11 @@
|
2
2
|
{<<"build_tools">>,[<<"mix">>]}.
|
3
3
|
{<<"description">>,
|
4
4
|
<<"Saxy is an XML parser and encoder in Elixir that focuses on speed and standard compliance.">>}.
|
5
|
- {<<"elixir">>,<<"~> 1.6">>}.
|
5
|
+ {<<"elixir">>,<<"~> 1.12">>}.
|
6
6
|
{<<"files">>,
|
7
|
- [<<"lib">>,<<"lib/saxy">>,<<"lib/saxy/prolog.ex">>,
|
8
|
- <<"lib/saxy/simple_form.ex">>,<<"lib/saxy/xml.ex">>,
|
9
|
- <<"lib/saxy/encoder.ex">>,<<"lib/saxy/guards.ex">>,
|
7
|
+ [<<"lib">>,<<"lib/saxy">>,<<"lib/saxy/prolog.ex">>,<<"lib/saxy/handler">>,
|
8
|
+ <<"lib/saxy/handler/accumulating.ex">>,<<"lib/saxy/simple_form.ex">>,
|
9
|
+ <<"lib/saxy/xml.ex">>,<<"lib/saxy/encoder.ex">>,<<"lib/saxy/guards.ex">>,
|
10
10
|
<<"lib/saxy/handler.ex">>,<<"lib/saxy/parser">>,
|
11
11
|
<<"lib/saxy/parser/lookahead.ex">>,<<"lib/saxy/parser/builder.ex">>,
|
12
12
|
<<"lib/saxy/parser/utils.ex">>,<<"lib/saxy/partial.ex">>,
|
|
@@ -22,4 +22,4 @@
|
22
22
|
{<<"GitHub">>,<<"https://github.com/qcam/saxy">>}]}.
|
23
23
|
{<<"name">>,<<"saxy">>}.
|
24
24
|
{<<"requirements">>,[]}.
|
25
|
- {<<"version">>,<<"1.5.0">>}.
|
25
|
+ {<<"version">>,<<"1.6.0">>}.
|
changed
lib/saxy.ex
|
@@ -111,8 +111,11 @@ defmodule Saxy do
|
111
111
|
|
112
112
|
"""
|
113
113
|
|
114
|
+ @compile {:inline, do_transform_stream: 4}
|
115
|
+
|
114
116
|
alias Saxy.{
|
115
117
|
Encoder,
|
118
|
+ Handler.Accumulating,
|
116
119
|
Parser,
|
117
120
|
State
|
118
121
|
}
|
|
@@ -309,10 +312,95 @@ defmodule Saxy do
|
309
312
|
end
|
310
313
|
|
311
314
|
defp reduce_stream(buffer, {cont_fun, state}) do
|
312
|
- with {:halted, cont_fun, state} <- cont_fun.(buffer, true, state) do
|
313
|
- {:cont, {cont_fun, state}}
|
314
|
- else
|
315
|
- other -> {:halt, other}
|
315
|
+ case cont_fun.(buffer, true, state) do
|
316
|
+ {:halted, cont_fun, state} ->
|
317
|
+ {:cont, {cont_fun, state}}
|
318
|
+
|
319
|
+ other ->
|
320
|
+ {:halt, other}
|
321
|
+ end
|
322
|
+ end
|
323
|
+
|
324
|
+ @doc """
|
325
|
+ Parses an XML stream and returns a stream of SAX events.
|
326
|
+
|
327
|
+ This function takes a stream and returns a stream of XML SAX events.
|
328
|
+ When any parsing error occurs, it raises a `Saxy.ParseError` exception.
|
329
|
+
|
330
|
+
|
331
|
+ ## Examples
|
332
|
+
|
333
|
+ iex> stream = File.stream!("./test/support/fixture/foo.xml")
|
334
|
+ iex> Enum.to_list Saxy.stream_events stream
|
335
|
+ [
|
336
|
+ start_document: [version: "1.0"],
|
337
|
+ start_element: {"foo", [{"bar", "value"}]},
|
338
|
+ end_element: "foo"
|
339
|
+ ]
|
340
|
+ iex> Enum.to_list Saxy.stream_events ["<foo>unclosed value"]
|
341
|
+ ** (Saxy.ParseError) unexpected end of input, expected token: :chardata
|
342
|
+
|
343
|
+ > #### Warning {: .warning }
|
344
|
+ >
|
345
|
+ > The input stream is evaluated lazily, so some events may be emitted before
|
346
|
+ > an exception is raised.
|
347
|
+
|
348
|
+ ## Memory usage
|
349
|
+
|
350
|
+ `Saxy.stream_events/2` takes a `File.Stream` or `Stream` as the input, so the number of bytes to buffer in each
|
351
|
+ chunk can be controlled by the `File.stream!/3` API.
|
352
|
+
|
353
|
+ During parsing, the actual memory used by Saxy might be higher than the number configured for each chunk, since
|
354
|
+ Saxy holds in memory some parsed parts of the original binary to leverage Erlang sub-binary extracting. Anyway,
|
355
|
+ Saxy tries to free those up when it makes sense.
|
356
|
+
|
357
|
+ ### Options
|
358
|
+
|
359
|
+ See the “Shared options” section at the module documentation.
|
360
|
+
|
361
|
+ * `:character_data_max_length` - tells the parser to emit the `:characters` event when its length exceeds the specified
|
362
|
+ number. The option is useful when the tag being parsed contains a very large chunk of data. Defaults to `:infinity`.
|
363
|
+ """
|
364
|
+ @spec stream_events(in_stream :: Enumerable.t(), options :: Keyword.t()) :: out_stream :: Enumerable.t()
|
365
|
+ def stream_events(stream, options \\ []) do
|
366
|
+ expand_entity = Keyword.get(options, :expand_entity, :keep)
|
367
|
+ character_data_max_length = Keyword.get(options, :character_data_max_length, :infinity)
|
368
|
+ cdata_as_characters = Keyword.get(options, :cdata_as_characters, true)
|
369
|
+
|
370
|
+ state = %State{
|
371
|
+ prolog: nil,
|
372
|
+ handler: Accumulating,
|
373
|
+ user_state: [],
|
374
|
+ expand_entity: expand_entity,
|
375
|
+ cdata_as_characters: cdata_as_characters,
|
376
|
+ character_data_max_length: character_data_max_length
|
377
|
+ }
|
378
|
+
|
379
|
+ init = {&Parser.Stream.parse_prolog(&1, &2, &1, 0, &3), state}
|
380
|
+
|
381
|
+ stream
|
382
|
+ |> Stream.concat([:end_of_stream])
|
383
|
+ |> Stream.transform(init, &transform_stream/2)
|
384
|
+ end
|
385
|
+
|
386
|
+ defp transform_stream(:end_of_stream, {cont_fun, state}) do
|
387
|
+ do_transform_stream(<<>>, false, cont_fun, state)
|
388
|
+ end
|
389
|
+
|
390
|
+ defp transform_stream(buffer, {cont_fun, state}) do
|
391
|
+ do_transform_stream(buffer, true, cont_fun, state)
|
392
|
+ end
|
393
|
+
|
394
|
+ defp do_transform_stream(buffer, more?, cont_fun, state) do
|
395
|
+ case cont_fun.(buffer, more?, state) do
|
396
|
+ {:halted, cont_fun, %{user_state: user_state} = state} ->
|
397
|
+ {:lists.reverse(user_state), {cont_fun, %{state | user_state: []}}}
|
398
|
+
|
399
|
+ {:error, error} ->
|
400
|
+ raise error
|
401
|
+
|
402
|
+ other ->
|
403
|
+ {:halt, other}
|
316
404
|
end
|
317
405
|
end
|
318
406
|
|
|
@@ -353,7 +441,7 @@ defmodule Saxy do
|
353
441
|
iex> prolog = [version: "1.0"]
|
354
442
|
iex> Saxy.encode_to_iodata!(root, prolog)
|
355
443
|
[
|
356
|
- ['<?xml', [32, 'version', 61, 34, "1.0", 34], [], [], '?>'],
|
444
|
+ [~c'<?xml', [32, ~c'version', 61, 34, "1.0", 34], [], [], ~c'?>'],
|
357
445
|
[60, "foo", 32, "foo", 61, 34, "bar", 34],
|
358
446
|
62,
|
359
447
|
["bar"],
|
changed
lib/saxy/encoder.ex
|
@@ -9,7 +9,7 @@ defmodule Saxy.Encoder do
|
9
9
|
end
|
10
10
|
|
11
11
|
defp prolog(%Saxy.Prolog{} = prolog) do
|
12
|
- ['<?xml', version(prolog.version), encoding(prolog.encoding), standalone(prolog.standalone), '?>']
|
12
|
+ [~c"<?xml", version(prolog.version), encoding(prolog.encoding), standalone(prolog.standalone), ~c"?>"]
|
13
13
|
end
|
14
14
|
|
15
15
|
defp prolog(prolog) when is_list(prolog) do
|
|
@@ -21,23 +21,23 @@ defmodule Saxy.Encoder do
|
21
21
|
defp prolog(nil), do: []
|
22
22
|
|
23
23
|
defp version(version) when is_binary(version) do
|
24
|
- [?\s, 'version', ?=, ?", version, ?"]
|
24
|
+ [?\s, ~c"version", ?=, ?", version, ?"]
|
25
25
|
end
|
26
26
|
|
27
27
|
defp encoding(nil), do: []
|
28
28
|
|
29
29
|
defp encoding(:utf8) do
|
30
|
- [?\s, 'encoding', ?=, ?", 'utf-8', ?"]
|
30
|
+ [?\s, ~c"encoding", ?=, ?", ~c"utf-8", ?"]
|
31
31
|
end
|
32
32
|
|
33
33
|
defp encoding(encoding) when encoding in ["UTF-8", "utf-8"] do
|
34
|
- [?\s, 'encoding', ?=, ?", ~c(#{encoding}), ?"]
|
34
|
+ [?\s, ~c"encoding", ?=, ?", ~c(#{encoding}), ?"]
|
35
35
|
end
|
36
36
|
|
37
37
|
defp standalone(nil), do: []
|
38
38
|
|
39
39
|
defp standalone(true) do
|
40
|
- [?\s, 'standalone', ?=, ?", "yes", ?"]
|
40
|
+ [?\s, ~c"standalone", ?=, ?", "yes", ?"]
|
41
41
|
end
|
42
42
|
|
43
43
|
defp element({tag_name, attributes, []}) do
|
|
@@ -102,11 +102,11 @@ defmodule Saxy.Encoder do
|
102
102
|
end
|
103
103
|
|
104
104
|
@escapes [
|
105
|
- {?<, '<'},
|
106
|
- {?>, '>'},
|
107
|
- {?&, '&'},
|
108
|
- {?", '"'},
|
109
|
- {?', '''}
|
105
|
+ {?<, ~c"<"},
|
106
|
+ {?>, ~c">"},
|
107
|
+ {?&, ~c"&"},
|
108
|
+ {?", ~c"""},
|
109
|
+ {?', ~c"'"}
|
110
110
|
]
|
111
111
|
|
112
112
|
for {match, insert} <- @escapes do
|
|
@@ -124,7 +124,7 @@ defmodule Saxy.Encoder do
|
124
124
|
end
|
125
125
|
|
126
126
|
defp cdata(characters) do
|
127
|
- ['<![CDATA[', characters | ']]>']
|
127
|
+ [~c"<![CDATA[", characters | ~c"]]>"]
|
128
128
|
end
|
129
129
|
|
130
130
|
defp reference({:entity, reference}) do
|
|
@@ -140,7 +140,7 @@ defmodule Saxy.Encoder do
|
140
140
|
end
|
141
141
|
|
142
142
|
defp comment(comment) do
|
143
|
- ['<!--', escape_comment(comment, comment) | '-->']
|
143
|
+ [~c"<!--", escape_comment(comment, comment) | ~c"-->"]
|
144
144
|
end
|
145
145
|
|
146
146
|
defp escape_comment(<<?->>, original) do
|
|
@@ -156,6 +156,6 @@ defmodule Saxy.Encoder do
|
156
156
|
end
|
157
157
|
|
158
158
|
defp processing_instruction(name, content) do
|
159
|
- ['<?', name, ?\s, content | '?>']
|
159
|
+ [~c"<?", name, ?\s, content | ~c"?>"]
|
160
160
|
end
|
161
161
|
end
|
added
lib/saxy/handler/accumulating.ex
|
@@ -0,0 +1,11 @@
|
1
|
+ defmodule Saxy.Handler.Accumulating do
|
2
|
+ # Accumulating handler originally intended to be
|
3
|
+ # used with stream transformations
|
4
|
+ @moduledoc false
|
5
|
+
|
6
|
+ @behaviour Saxy.Handler
|
7
|
+
|
8
|
+ def handle_event(event, data, state) do
|
9
|
+ {:ok, [{event, data} | state]}
|
10
|
+ end
|
11
|
+ end
|
changed
lib/saxy/parser/builder.ex
|
@@ -27,6 +27,9 @@ defmodule Saxy.Parser.Builder do
|
27
27
|
|
28
28
|
defp prolog(<<buffer::bits>>, more?, original, pos, state) do
|
29
29
|
lookahead(buffer, @streaming) do
|
30
|
+ "<?xml-" <> _rest ->
|
31
|
+ prolog_misc(buffer, more?, original, pos, state, [])
|
32
|
+
|
30
33
|
"<?xml" <> rest ->
|
31
34
|
xml_decl(rest, more?, original, pos + 5, state)
|
32
35
|
|
|
@@ -107,7 +110,7 @@ defmodule Saxy.Parser.Builder do
|
107
110
|
|
108
111
|
encoding_decl(rest, more?, original, pos + len + 1, state, prolog)
|
109
112
|
|
110
|
- char <> rest when char in '0123456789' ->
|
113
|
+ char <> rest when char in ~c"0123456789" ->
|
111
114
|
xml_ver_num(rest, more?, original, pos, state, open_quote, len + 1)
|
112
115
|
|
113
116
|
_ in [""] when more? ->
|
|
@@ -851,7 +854,7 @@ defmodule Saxy.Parser.Builder do
|
851
854
|
open_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint))
|
852
855
|
|
853
856
|
"/" <> rest ->
|
854
|
- close_tag_name(rest, more?, original, pos + 1, state, 0)
|
857
|
+ close_tag_name(rest, more?, original, pos + 1, state, 0, 0)
|
855
858
|
|
856
859
|
"![CDATA[" <> rest ->
|
857
860
|
element_cdata(rest, more?, original, pos + 8, state, 0)
|
|
@@ -1161,30 +1164,31 @@ defmodule Saxy.Parser.Builder do
|
1161
1164
|
end
|
1162
1165
|
end
|
1163
1166
|
|
1164
|
- defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, 0) do
|
1167
|
+ defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, 0, 0) do
|
1165
1168
|
lookahead buffer, @streaming do
|
1166
1169
|
char <> rest when is_ascii_name_start_char(char) ->
|
1167
|
- close_tag_name(rest, more?, original, pos, state, 1)
|
1170
|
+ close_tag_name(rest, more?, original, pos, state, 1, 1)
|
1168
1171
|
|
1169
1172
|
token in unquote(utf8_binaries()) when more? ->
|
1170
|
- halt!(close_tag_name(token, more?, original, pos, state, 0))
|
1173
|
+ halt!(close_tag_name(token, more?, original, pos, state, 0, 0))
|
1171
1174
|
|
1172
1175
|
<<codepoint::utf8>> <> rest when is_utf8_name_start_char(codepoint) ->
|
1173
|
- close_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint))
|
1176
|
+ len = Utils.compute_char_len(codepoint)
|
1177
|
+ close_tag_name(rest, more?, original, pos, state, len, len)
|
1174
1178
|
|
1175
1179
|
_ in [""] when more? ->
|
1176
|
- halt!(close_tag_name("", more?, original, pos, state, 0))
|
1180
|
+ halt!(close_tag_name("", more?, original, pos, state, 0, 0))
|
1177
1181
|
|
1178
1182
|
_ ->
|
1179
1183
|
Utils.parse_error(original, pos, state, {:token, :end_tag})
|
1180
1184
|
end
|
1181
1185
|
end
|
1182
1186
|
|
1183
|
- defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, len) do
|
1187
|
+ defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, len, copy_to) do
|
1184
1188
|
lookahead buffer, @streaming do
|
1185
1189
|
">" <> rest ->
|
1186
1190
|
[open_tag | stack] = state.stack
|
1187
|
- ending_tag = binary_part(original, pos, len)
|
1191
|
+ ending_tag = binary_part(original, pos, copy_to)
|
1188
1192
|
pos = pos + len + 1
|
1189
1193
|
|
1190
1194
|
if open_tag == ending_tag do
|
|
@@ -1205,16 +1209,20 @@ defmodule Saxy.Parser.Builder do
|
1205
1209
|
end
|
1206
1210
|
|
1207
1211
|
char <> rest when is_ascii_name_char(char) ->
|
1208
|
- close_tag_name(rest, more?, original, pos, state, len + 1)
|
1212
|
+ close_tag_name(rest, more?, original, pos, state, len + 1, copy_to + 1)
|
1213
|
+
|
1214
|
+ char <> rest when is_whitespace(char) ->
|
1215
|
+ close_tag_name(rest, more?, original, pos, state, len + 1, copy_to)
|
1209
1216
|
|
1210
1217
|
token in unquote(utf8_binaries()) when more? ->
|
1211
|
- halt!(close_tag_name(token, more?, original, pos, state, len))
|
1218
|
+ halt!(close_tag_name(token, more?, original, pos, state, len, copy_to))
|
1212
1219
|
|
1213
1220
|
<<codepoint::utf8>> <> rest when is_utf8_name_char(codepoint) ->
|
1214
|
- close_tag_name(rest, more?, original, pos, state, len + Utils.compute_char_len(codepoint))
|
1221
|
+ char_len = Utils.compute_char_len(codepoint)
|
1222
|
+ close_tag_name(rest, more?, original, pos, state, len + char_len, copy_to + char_len)
|
1215
1223
|
|
1216
1224
|
_ in [""] when more? ->
|
1217
|
- halt!(close_tag_name("", more?, original, pos, state, len))
|
1225
|
+ halt!(close_tag_name("", more?, original, pos, state, len, copy_to))
|
1218
1226
|
|
1219
1227
|
_ ->
|
1220
1228
|
Utils.parse_error(original, pos + len, state, {:token, :end_tag})
|
changed
mix.exs
|
@@ -2,13 +2,13 @@ defmodule Saxy.MixProject do
|
2
2
|
use Mix.Project
|
3
3
|
|
4
4
|
@source_url "https://github.com/qcam/saxy"
|
5
|
- @version "1.5.0"
|
5
|
+ @version "1.6.0"
|
6
6
|
|
7
7
|
def project() do
|
8
8
|
[
|
9
9
|
app: :saxy,
|
10
10
|
version: @version,
|
11
|
- elixir: "~> 1.6",
|
11
|
+ elixir: "~> 1.12",
|
12
12
|
name: "Saxy",
|
13
13
|
consolidate_protocols: Mix.env() != :test,
|
14
14
|
deps: deps(),
|
|
@@ -36,7 +36,7 @@ defmodule Saxy.MixProject do
|
36
36
|
defp deps() do
|
37
37
|
[
|
38
38
|
{:ex_doc, ">= 0.0.0", only: :dev, runtime: false},
|
39
|
- {:stream_data, "~> 0.5", only: [:dev, :test]}
|
39
|
+ {:stream_data, "~> 1.0", only: [:dev, :test]}
|
40
40
|
]
|
41
41
|
end
|