changed CHANGELOG.md
 
@@ -1,5 +1,11 @@
1
1
# Changelog
2
2
3
+ ## v1.6.0 - 2024-10-22
4
+
5
+ * Require Elixir 1.12+.
6
+ * Introduce `Saxy.stream_events` (#118)
7
+ * Several bug fixes and resolved compilation warnings
8
+
3
9
## v1.5.0 - 2022-11-09
4
10
5
11
* Saxy.encode! now supports "UTF-8"/"utf-8" as string in the prolog [#102](https://github.com/qcam/saxy/pull/102).
changed README.md
 
@@ -23,7 +23,7 @@ Add `:saxy` to your `mix.exs`.
23
23
```elixir
24
24
def deps() do
25
25
[
26
- {:saxy, "~> 1.5"}
26
+ {:saxy, "~> 1.6"}
27
27
]
28
28
end
29
29
```
changed hex_metadata.config
 
@@ -2,11 +2,11 @@
2
2
{<<"build_tools">>,[<<"mix">>]}.
3
3
{<<"description">>,
4
4
<<"Saxy is an XML parser and encoder in Elixir that focuses on speed and standard compliance.">>}.
5
- {<<"elixir">>,<<"~> 1.6">>}.
5
+ {<<"elixir">>,<<"~> 1.12">>}.
6
6
{<<"files">>,
7
- [<<"lib">>,<<"lib/saxy">>,<<"lib/saxy/prolog.ex">>,
8
- <<"lib/saxy/simple_form.ex">>,<<"lib/saxy/xml.ex">>,
9
- <<"lib/saxy/encoder.ex">>,<<"lib/saxy/guards.ex">>,
7
+ [<<"lib">>,<<"lib/saxy">>,<<"lib/saxy/prolog.ex">>,<<"lib/saxy/handler">>,
8
+ <<"lib/saxy/handler/accumulating.ex">>,<<"lib/saxy/simple_form.ex">>,
9
+ <<"lib/saxy/xml.ex">>,<<"lib/saxy/encoder.ex">>,<<"lib/saxy/guards.ex">>,
10
10
<<"lib/saxy/handler.ex">>,<<"lib/saxy/parser">>,
11
11
<<"lib/saxy/parser/lookahead.ex">>,<<"lib/saxy/parser/builder.ex">>,
12
12
<<"lib/saxy/parser/utils.ex">>,<<"lib/saxy/partial.ex">>,
 
@@ -22,4 +22,4 @@
22
22
{<<"GitHub">>,<<"https://github.com/qcam/saxy">>}]}.
23
23
{<<"name">>,<<"saxy">>}.
24
24
{<<"requirements">>,[]}.
25
- {<<"version">>,<<"1.5.0">>}.
25
+ {<<"version">>,<<"1.6.0">>}.
changed lib/saxy.ex
 
@@ -111,8 +111,11 @@ defmodule Saxy do
111
111
112
112
"""
113
113
114
+ @compile {:inline, do_transform_stream: 4}
115
+
114
116
alias Saxy.{
115
117
Encoder,
118
+ Handler.Accumulating,
116
119
Parser,
117
120
State
118
121
}
 
@@ -309,10 +312,95 @@ defmodule Saxy do
309
312
end
310
313
311
314
defp reduce_stream(buffer, {cont_fun, state}) do
312
- with {:halted, cont_fun, state} <- cont_fun.(buffer, true, state) do
313
- {:cont, {cont_fun, state}}
314
- else
315
- other -> {:halt, other}
315
+ case cont_fun.(buffer, true, state) do
316
+ {:halted, cont_fun, state} ->
317
+ {:cont, {cont_fun, state}}
318
+
319
+ other ->
320
+ {:halt, other}
321
+ end
322
+ end
323
+
324
+ @doc """
325
+ Parses an XML stream and returns a stream of SAX events.
326
+
327
+ This function takes a stream and returns a stream of XML SAX events.
328
+ When any parsing error occurs, it raises a `Saxy.ParseError` exception.
329
+
330
+
331
+ ## Examples
332
+
333
+ iex> stream = File.stream!("./test/support/fixture/foo.xml")
334
+ iex> Enum.to_list Saxy.stream_events stream
335
+ [
336
+ start_document: [version: "1.0"],
337
+ start_element: {"foo", [{"bar", "value"}]},
338
+ end_element: "foo"
339
+ ]
340
+ iex> Enum.to_list Saxy.stream_events ["<foo>unclosed value"]
341
+ ** (Saxy.ParseError) unexpected end of input, expected token: :chardata
342
+
343
+ > #### Warning {: .warning }
344
+ >
345
+ > The input stream is evaluated lazily, therefore some events may be emitted before
346
+ > an exception is raised.
347
+
348
+ ## Memory usage
349
+
350
+ `Saxy.stream_events/2` takes a `File.Stream` or `Stream` as the input, so the number of bytes to buffer in each
351
+ chunk can be controlled by `File.stream!/3` API.
352
+
353
+ During parsing, the actual memory used by Saxy might be higher than the number configured for each chunk, since
354
+ Saxy holds in memory some parsed parts of the original binary to leverage Erlang sub-binary extracting. Anyway,
355
+ Saxy tries to free those up when it makes sense.
356
+
357
+ ### Options
358
+
359
+ See the “Shared options” section at the module documentation.
360
+
361
+ * `:character_data_max_length` - tells the parser to emit the `:characters` event when its length exceeds the specified
362
+ number. The option is useful when the tag being parsed contains a very large chunk of data. Defaults to `:infinity`.
363
+ """
364
+ @spec stream_events(in_stream :: Enumerable.t(), options :: Keyword.t()) :: out_stream :: Enumerable.t()
365
+ def stream_events(stream, options \\ []) do
366
+ expand_entity = Keyword.get(options, :expand_entity, :keep)
367
+ character_data_max_length = Keyword.get(options, :character_data_max_length, :infinity)
368
+ cdata_as_characters = Keyword.get(options, :cdata_as_characters, true)
369
+
370
+ state = %State{
371
+ prolog: nil,
372
+ handler: Accumulating,
373
+ user_state: [],
374
+ expand_entity: expand_entity,
375
+ cdata_as_characters: cdata_as_characters,
376
+ character_data_max_length: character_data_max_length
377
+ }
378
+
379
+ init = {&Parser.Stream.parse_prolog(&1, &2, &1, 0, &3), state}
380
+
381
+ stream
382
+ |> Stream.concat([:end_of_stream])
383
+ |> Stream.transform(init, &transform_stream/2)
384
+ end
385
+
386
+ defp transform_stream(:end_of_stream, {cont_fun, state}) do
387
+ do_transform_stream(<<>>, false, cont_fun, state)
388
+ end
389
+
390
+ defp transform_stream(buffer, {cont_fun, state}) do
391
+ do_transform_stream(buffer, true, cont_fun, state)
392
+ end
393
+
394
+ defp do_transform_stream(buffer, more?, cont_fun, state) do
395
+ case cont_fun.(buffer, more?, state) do
396
+ {:halted, cont_fun, %{user_state: user_state} = state} ->
397
+ {:lists.reverse(user_state), {cont_fun, %{state | user_state: []}}}
398
+
399
+ {:error, error} ->
400
+ raise error
401
+
402
+ other ->
403
+ {:halt, other}
316
404
end
317
405
end
318
406
 
@@ -353,7 +441,7 @@ defmodule Saxy do
353
441
iex> prolog = [version: "1.0"]
354
442
iex> Saxy.encode_to_iodata!(root, prolog)
355
443
[
356
- ['<?xml', [32, 'version', 61, 34, "1.0", 34], [], [], '?>'],
444
+ [~c'<?xml', [32, ~c'version', 61, 34, "1.0", 34], [], [], ~c'?>'],
357
445
[60, "foo", 32, "foo", 61, 34, "bar", 34],
358
446
62,
359
447
["bar"],
changed lib/saxy/encoder.ex
 
@@ -9,7 +9,7 @@ defmodule Saxy.Encoder do
9
9
end
10
10
11
11
defp prolog(%Saxy.Prolog{} = prolog) do
12
- ['<?xml', version(prolog.version), encoding(prolog.encoding), standalone(prolog.standalone), '?>']
12
+ [~c"<?xml", version(prolog.version), encoding(prolog.encoding), standalone(prolog.standalone), ~c"?>"]
13
13
end
14
14
15
15
defp prolog(prolog) when is_list(prolog) do
 
@@ -21,23 +21,23 @@ defmodule Saxy.Encoder do
21
21
defp prolog(nil), do: []
22
22
23
23
defp version(version) when is_binary(version) do
24
- [?\s, 'version', ?=, ?", version, ?"]
24
+ [?\s, ~c"version", ?=, ?", version, ?"]
25
25
end
26
26
27
27
defp encoding(nil), do: []
28
28
29
29
defp encoding(:utf8) do
30
- [?\s, 'encoding', ?=, ?", 'utf-8', ?"]
30
+ [?\s, ~c"encoding", ?=, ?", ~c"utf-8", ?"]
31
31
end
32
32
33
33
defp encoding(encoding) when encoding in ["UTF-8", "utf-8"] do
34
- [?\s, 'encoding', ?=, ?", ~c(#{encoding}), ?"]
34
+ [?\s, ~c"encoding", ?=, ?", ~c(#{encoding}), ?"]
35
35
end
36
36
37
37
defp standalone(nil), do: []
38
38
39
39
defp standalone(true) do
40
- [?\s, 'standalone', ?=, ?", "yes", ?"]
40
+ [?\s, ~c"standalone", ?=, ?", "yes", ?"]
41
41
end
42
42
43
43
defp element({tag_name, attributes, []}) do
 
@@ -102,11 +102,11 @@ defmodule Saxy.Encoder do
102
102
end
103
103
104
104
@escapes [
105
- {?<, '&lt;'},
106
- {?>, '&gt;'},
107
- {?&, '&amp;'},
108
- {?", '&quot;'},
109
- {?', '&apos;'}
105
+ {?<, ~c"&lt;"},
106
+ {?>, ~c"&gt;"},
107
+ {?&, ~c"&amp;"},
108
+ {?", ~c"&quot;"},
109
+ {?', ~c"&apos;"}
110
110
]
111
111
112
112
for {match, insert} <- @escapes do
 
@@ -124,7 +124,7 @@ defmodule Saxy.Encoder do
124
124
end
125
125
126
126
defp cdata(characters) do
127
- ['<![CDATA[', characters | ']]>']
127
+ [~c"<![CDATA[", characters | ~c"]]>"]
128
128
end
129
129
130
130
defp reference({:entity, reference}) do
 
@@ -140,7 +140,7 @@ defmodule Saxy.Encoder do
140
140
end
141
141
142
142
defp comment(comment) do
143
- ['<!--', escape_comment(comment, comment) | '-->']
143
+ [~c"<!--", escape_comment(comment, comment) | ~c"-->"]
144
144
end
145
145
146
146
defp escape_comment(<<?->>, original) do
 
@@ -156,6 +156,6 @@ defmodule Saxy.Encoder do
156
156
end
157
157
158
158
defp processing_instruction(name, content) do
159
- ['<?', name, ?\s, content | '?>']
159
+ [~c"<?", name, ?\s, content | ~c"?>"]
160
160
end
161
161
end
added lib/saxy/handler/accumulating.ex
 
@@ -0,0 +1,11 @@
1
+ defmodule Saxy.Handler.Accumulating do
2
+ # Accumulating handler originally intended to be
3
+ # used with stream transformations
4
+ @moduledoc false
5
+
6
+ @behaviour Saxy.Handler
7
+
8
+ def handle_event(event, data, state) do
9
+ {:ok, [{event, data} | state]}
10
+ end
11
+ end
changed lib/saxy/parser/builder.ex
 
@@ -27,6 +27,9 @@ defmodule Saxy.Parser.Builder do
27
27
28
28
defp prolog(<<buffer::bits>>, more?, original, pos, state) do
29
29
lookahead(buffer, @streaming) do
30
+ "<?xml-" <> _rest ->
31
+ prolog_misc(buffer, more?, original, pos, state, [])
32
+
30
33
"<?xml" <> rest ->
31
34
xml_decl(rest, more?, original, pos + 5, state)
32
35
 
@@ -107,7 +110,7 @@ defmodule Saxy.Parser.Builder do
107
110
108
111
encoding_decl(rest, more?, original, pos + len + 1, state, prolog)
109
112
110
- char <> rest when char in '0123456789' ->
113
+ char <> rest when char in ~c"0123456789" ->
111
114
xml_ver_num(rest, more?, original, pos, state, open_quote, len + 1)
112
115
113
116
_ in [""] when more? ->
 
@@ -851,7 +854,7 @@ defmodule Saxy.Parser.Builder do
851
854
open_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint))
852
855
853
856
"/" <> rest ->
854
- close_tag_name(rest, more?, original, pos + 1, state, 0)
857
+ close_tag_name(rest, more?, original, pos + 1, state, 0, 0)
855
858
856
859
"![CDATA[" <> rest ->
857
860
element_cdata(rest, more?, original, pos + 8, state, 0)
 
@@ -1161,30 +1164,31 @@ defmodule Saxy.Parser.Builder do
1161
1164
end
1162
1165
end
1163
1166
1164
- defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, 0) do
1167
+ defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, 0, 0) do
1165
1168
lookahead buffer, @streaming do
1166
1169
char <> rest when is_ascii_name_start_char(char) ->
1167
- close_tag_name(rest, more?, original, pos, state, 1)
1170
+ close_tag_name(rest, more?, original, pos, state, 1, 1)
1168
1171
1169
1172
token in unquote(utf8_binaries()) when more? ->
1170
- halt!(close_tag_name(token, more?, original, pos, state, 0))
1173
+ halt!(close_tag_name(token, more?, original, pos, state, 0, 0))
1171
1174
1172
1175
<<codepoint::utf8>> <> rest when is_utf8_name_start_char(codepoint) ->
1173
- close_tag_name(rest, more?, original, pos, state, Utils.compute_char_len(codepoint))
1176
+ len = Utils.compute_char_len(codepoint)
1177
+ close_tag_name(rest, more?, original, pos, state, len, len)
1174
1178
1175
1179
_ in [""] when more? ->
1176
- halt!(close_tag_name("", more?, original, pos, state, 0))
1180
+ halt!(close_tag_name("", more?, original, pos, state, 0, 0))
1177
1181
1178
1182
_ ->
1179
1183
Utils.parse_error(original, pos, state, {:token, :end_tag})
1180
1184
end
1181
1185
end
1182
1186
1183
- defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, len) do
1187
+ defp close_tag_name(<<buffer::bits>>, more?, original, pos, state, len, copy_to) do
1184
1188
lookahead buffer, @streaming do
1185
1189
">" <> rest ->
1186
1190
[open_tag | stack] = state.stack
1187
- ending_tag = binary_part(original, pos, len)
1191
+ ending_tag = binary_part(original, pos, copy_to)
1188
1192
pos = pos + len + 1
1189
1193
1190
1194
if open_tag == ending_tag do
 
@@ -1205,16 +1209,20 @@ defmodule Saxy.Parser.Builder do
1205
1209
end
1206
1210
1207
1211
char <> rest when is_ascii_name_char(char) ->
1208
- close_tag_name(rest, more?, original, pos, state, len + 1)
1212
+ close_tag_name(rest, more?, original, pos, state, len + 1, copy_to + 1)
1213
+
1214
+ char <> rest when is_whitespace(char) ->
1215
+ close_tag_name(rest, more?, original, pos, state, len + 1, copy_to)
1209
1216
1210
1217
token in unquote(utf8_binaries()) when more? ->
1211
- halt!(close_tag_name(token, more?, original, pos, state, len))
1218
+ halt!(close_tag_name(token, more?, original, pos, state, len, copy_to))
1212
1219
1213
1220
<<codepoint::utf8>> <> rest when is_utf8_name_char(codepoint) ->
1214
- close_tag_name(rest, more?, original, pos, state, len + Utils.compute_char_len(codepoint))
1221
+ char_len = Utils.compute_char_len(codepoint)
1222
+ close_tag_name(rest, more?, original, pos, state, len + char_len, copy_to + char_len)
1215
1223
1216
1224
_ in [""] when more? ->
1217
- halt!(close_tag_name("", more?, original, pos, state, len))
1225
+ halt!(close_tag_name("", more?, original, pos, state, len, copy_to))
1218
1226
1219
1227
_ ->
1220
1228
Utils.parse_error(original, pos + len, state, {:token, :end_tag})
changed mix.exs
 
@@ -2,13 +2,13 @@ defmodule Saxy.MixProject do
2
2
use Mix.Project
3
3
4
4
@source_url "https://github.com/qcam/saxy"
5
- @version "1.5.0"
5
+ @version "1.6.0"
6
6
7
7
def project() do
8
8
[
9
9
app: :saxy,
10
10
version: @version,
11
- elixir: "~> 1.6",
11
+ elixir: "~> 1.12",
12
12
name: "Saxy",
13
13
consolidate_protocols: Mix.env() != :test,
14
14
deps: deps(),
 
@@ -36,7 +36,7 @@ defmodule Saxy.MixProject do
36
36
defp deps() do
37
37
[
38
38
{:ex_doc, ">= 0.0.0", only: :dev, runtime: false},
39
- {:stream_data, "~> 0.5", only: [:dev, :test]}
39
+ {:stream_data, "~> 1.0", only: [:dev, :test]}
40
40
]
41
41
end