From 4fe095884328295c458d22fde46373b185428a7d Mon Sep 17 00:00:00 2001 From: Alex Roper Date: Wed, 13 May 2020 18:16:22 -0700 Subject: [PATCH 1/3] Fix headline parsing leading to dropped lines. Currently, headline parsing breaks the file into lines before parsing headlines, stripping terminal \n or \r\n. This prevents parse_headline_level from differentiating between end of line and end of file. This can lead to an edge case where a line is considered a headline for the purposes of stopping parsing the body of the previous, yet not a headline itself. This leads to parsing stopping there. If the file is immediately written, this results in truncating it. One example of this is `"* \n*\r\n* \n"`, which will be parsed identically to `"* \n"`. --- src/parsers.rs | 126 +++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 111 insertions(+), 15 deletions(-) diff --git a/src/parsers.rs b/src/parsers.rs index d277057..5af8b9f 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -4,7 +4,12 @@ use std::marker::PhantomData; use indextree::{Arena, NodeId}; use jetscii::{bytes, BytesConst}; use memchr::{memchr, memchr_iter}; -use nom::bytes::complete::take_while1; +use nom::{ + bytes::complete::is_a, + character::complete::one_of, + combinator::{map, verify}, + IResult, +}; use crate::config::ParseConfig; use crate::elements::{ @@ -635,23 +640,114 @@ pub fn blank_lines_count(input: &str) -> (&str, usize) { crate::parse::combinators::blank_lines_count(input).unwrap_or((input, 0)) } -pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> { - let (input_, level) = parse_headline_level(input)?; - let (input_, content) = lines_while(move |line| { - parse_headline_level(line) - .map(|(_, l)| l > level) - .unwrap_or(true) - })(input_) - .unwrap_or((input_, "")); - Some((input_, (&input[0..level + content.len()], level))) +// Matches a headline of level <= max_level. This will always be exactly one +// line, including the terminal \n if one is present. Unlike org-mode (but like +// org-element), we accept '\n' and EOF to terminate the stars. Returns the +// number of stars. Must only be called at the start of a line. +fn parse_headline_level_le(input: &str, max_level: usize) -> IResult<&str, usize, ()> { + let (input, level) = verify( + map(is_a("*"), |s: &str| s.chars().count()), + |level: &usize| *level <= max_level, + )(input)?; + if !input.is_empty() { + one_of("\n ")(input)?; + let (input, _) = line_length(input)?; + Ok((input, level)) + } else { + Ok((input, level)) + } } -pub fn parse_headline_level(input: &str) -> Option<(&str, usize)> { - let (input, stars) = take_while1::<_, _, ()>(|c: char| c == '*')(input).ok()?; +// Recognizes until end-of-line or end-of-input and returns the length of the +// line, including the terminal \n (or \r\n) if present. +fn line_length(input: &str) -> IResult<&str, usize, ()> { + match memchr(b'\n', input.as_bytes()) { + Some(index) => Ok((&input[index + 1..], index + 1)), + None => Ok(("", input.len())), + } +} + +pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> { + // Consume the headline. + let (text, level) = parse_headline_level_le(input, std::usize::MAX).ok()?; + + // Collect lines until EOF or a headline. + let mut last = 0; + for i in memchr_iter(b'\n', text.as_bytes()) { + if parse_headline_level_le(&text[last..], level).is_ok() { + break; + } + + last = i + 1; + } - if input.starts_with(' ') || input.starts_with('\n') || input.is_empty() { - Some((input, stars.len())) + if last < text.len() && parse_headline_level_le(&text[last..], level).is_err() { + Some(("", (input, level))) } else { - None + Some(( + &text[last..], + (&input[..(input.len() - text.len()) + last], level), + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_headline() { + assert_eq!(parse_headline("*"), Some(("", ("*", 1)))); + assert_eq!(parse_headline("* "), Some(("", ("* ", 1)))); + assert_eq!(parse_headline("* \r"), Some(("", ("* \r", 1)))); + assert_eq!(parse_headline("*\t"), None); + assert_eq!(parse_headline("*\t\n"), None); + assert_eq!(parse_headline("* \n"), Some(("", ("* \n", 1)))); + assert_eq!(parse_headline("* \n\r*"), Some(("", ("* \n\r*", 1)))); + assert_eq!(parse_headline("* \n\r**"), Some(("", ("* \n\r**", 1)))); + assert_eq!(parse_headline("*\n*"), Some(("*", ("*\n", 1)))); + assert_eq!(parse_headline("*\n\n*"), Some(("*", ("*\n\n", 1)))); + assert_eq!(parse_headline("*\r"), None); + assert_eq!(parse_headline("* *"), Some(("", ("* *", 1)))); + assert_eq!(parse_headline("***\r** Hello\n"), None); + assert_eq!( + parse_headline("*** ** Hello\n"), + Some(("", ("*** ** Hello\n", 3))) + ); + assert_eq!(parse_headline("* Hello"), Some(("", ("* Hello", 1)))); + assert_eq!( + parse_headline("*** Hi\nWorld"), + Some(("", ("*** Hi\nWorld", 3))) + ); + + assert_eq!( + parse_headline("* Hello\nText\n** Test\n ** More text\n* World\n"), + Some(("* World\n", ("* Hello\nText\n** Test\n ** More text\n", 1))) + ); + + // We can parse a headline that contains the *\r\n. It is treated as + // text in the section. + assert_eq!( + parse_headline("* \n*\r\n* \n"), + Some(("* \n", ("* \n*\r\n", 1))) + ); + + // We can't parse a headline starting at *\r\n, thus ensuring that each + // line either is or is not a headline. + assert_eq!(parse_headline("*\r\n* \n"), None); + + assert_eq!(parse_headline("* \n"), Some(("", ("* \n", 1)))); + + assert_eq!( + parse_headline("* \n**\r\n* \n"), + Some(("* \n", ("* \n**\r\n", 1))) + ); + + assert_eq!(parse_headline("* a\n*"), Some(("*", ("* a\n", 1)))); + assert_eq!(parse_headline("* a\r\n*"), Some(("*", ("* a\r\n", 1)))); + assert_eq!(parse_headline("* a\r\n* b"), Some(("* b", ("* a\r\n", 1)))); + assert_eq!(parse_headline("* a\n* "), Some(("* ", ("* a\n", 1)))); + assert_eq!(parse_headline("* a\n* \n"), Some(("* \n", ("* a\n", 1)))); + assert_eq!(parse_headline("* a\n* \n"), Some(("* \n", ("* a\n", 1)))); } } From 9cf38d1dc5a28421fa7d43ddc6815e73314f6c2d Mon Sep 17 00:00:00 2001 From: Alex Roper Date: Thu, 14 May 2020 01:56:55 -0700 Subject: [PATCH 2/3] Add a fastpath for newlines that don't start with a star. --- src/parsers.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/parsers.rs b/src/parsers.rs index 5af8b9f..ffcd438 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -674,7 +674,8 @@ pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> { // Collect lines until EOF or a headline. let mut last = 0; for i in memchr_iter(b'\n', text.as_bytes()) { - if parse_headline_level_le(&text[last..], level).is_ok() { + // Check the first byte after the newline to skip parsing unnecessarily. + if text.as_bytes()[last] == b'*' && parse_headline_level_le(&text[last..], level).is_ok() { break; } @@ -697,11 +698,26 @@ mod tests { #[test] fn test_parse_headline() { + assert_eq!(parse_headline(""), None); + assert_eq!(parse_headline("\n"), None); + assert_eq!(parse_headline("Hello"), None); + assert_eq!(parse_headline("Hello\n"), None); + assert_eq!(parse_headline("Hello\r"), None); + assert_eq!(parse_headline("Hello\n\r"), None); + assert_eq!(parse_headline("Hello\r\n"), None); + assert_eq!(parse_headline("Hello\n*"), None); + assert_eq!(parse_headline("Hello\n\n*"), None); + assert_eq!(parse_headline("Hello\r\n*"), None); + assert_eq!(parse_headline("Hello\n\r\n*"), None); + assert_eq!(parse_headline("Hello\r\n\n*"), None); assert_eq!(parse_headline("*"), Some(("", ("*", 1)))); + assert_eq!(parse_headline("*\n"), Some(("", ("*\n", 1)))); + assert_eq!(parse_headline("*\n\r"), Some(("", ("*\n\r", 1)))); assert_eq!(parse_headline("* "), Some(("", ("* ", 1)))); assert_eq!(parse_headline("* \r"), Some(("", ("* \r", 1)))); assert_eq!(parse_headline("*\t"), None); assert_eq!(parse_headline("*\t\n"), None); + assert_eq!(parse_headline("*\r\n"), None); assert_eq!(parse_headline("* \n"), Some(("", ("* \n", 1)))); assert_eq!(parse_headline("* \n\r*"), Some(("", ("* \n\r*", 1)))); assert_eq!(parse_headline("* \n\r**"), Some(("", ("* \n\r**", 1)))); From 849005c1072f0927c184914776adff26c3b195b9 Mon Sep 17 00:00:00 2001 From: Alex Roper Date: Thu, 14 May 2020 02:00:13 -0700 Subject: [PATCH 3/3] Fix drawer parsing to not take headlines. Currently, when parsing a drawer for a node of level N, any headlines of level N + 1 will be absorbed into the drawer. This changes drawer parsing to reject headlines. For example: ``` * Hello :PROPERTIES: ** World :END: ``` should be parsed as two nodes. The drawer is ill-formed, and will be parsed as part of the body instead of as a drawer. --- src/elements/drawer.rs | 44 ++++++++++++++++++++++++++++++++++++++++-- src/elements/title.rs | 2 +- src/parsers.rs | 39 ++++++++++++++++++++++++------------- tests/issue_24.rs | 18 +++++++++++++++++ 4 files changed, 87 insertions(+), 16 deletions(-) create mode 100644 tests/issue_24.rs diff --git a/src/elements/drawer.rs b/src/elements/drawer.rs index 20bb956..50f1e47 100644 --- a/src/elements/drawer.rs +++ b/src/elements/drawer.rs @@ -7,7 +7,10 @@ use nom::{ IResult, }; -use crate::parse::combinators::{blank_lines_count, eol, lines_till}; +use crate::{ + parse::combinators::{blank_lines_count, eol, lines_till}, + parsers::lines_until_headline_at_level_le, +}; /// Drawer Element #[derive(Debug, Default, Clone)] @@ -59,7 +62,18 @@ pub fn parse_drawer_without_blank(input: &str) -> IResult<&str, (Drawer, &str), tag(":"), )(input)?; let (input, _) = eol(input)?; - let (input, contents) = lines_till(|line| line.trim().eq_ignore_ascii_case(":END:"))(input)?; + + // Restrict the search for the end of the drawer to the current headline. + let (_input_after_headline, (input_until_headline, _level)) = + lines_until_headline_at_level_le(input, std::usize::MAX)?; + + // tail is the remaining not used for the drawer out of + // input_until_headline. + let (tail, contents) = + lines_till(|line| line.trim().eq_ignore_ascii_case(":END:"))(input_until_headline)?; + + // Skip over the amount used by the drawer. + let input = &input[input_until_headline.len() - tail.len()..]; Ok(( input, @@ -118,4 +132,30 @@ fn parse() { // https://github.com/PoiScript/orgize/issues/9 assert!(parse_drawer(":SPAGHETTI:\n").is_err()); + + // https://github.com/PoiScript/orgize/issues/24 + // A drawer may not contain a headline. + assert!(parse_drawer( + r#":MYDRAWER: +* Node + :END:"# + ) + .is_err(),); + + // A drawer may not contain another drawer. An attempt to do so will result + // in the drawer ending at the first end line. + assert_eq!( + parse_drawer(":OUTER:\nOuter Text\n:INNER:\nInner Text\n:END:\n:END:"), + Ok(( + ":END:", + ( + Drawer { + name: "OUTER".into(), + pre_blank: 0, + post_blank: 0 + }, + "Outer Text\n:INNER:\nInner Text\n" + ) + )) + ); } diff --git a/src/elements/title.rs b/src/elements/title.rs index b37d484..8f98294 100644 --- a/src/elements/title.rs +++ b/src/elements/title.rs @@ -453,5 +453,5 @@ fn parse_properties_drawer_() { .into_iter() .collect::>() )) - ) + ); } diff --git a/src/parsers.rs b/src/parsers.rs index ffcd438..a9d065e 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -644,7 +644,7 @@ pub fn blank_lines_count(input: &str) -> (&str, usize) { // line, including the terminal \n if one is present. Unlike org-mode (but like // org-element), we accept '\n' and EOF to terminate the stars. Returns the // number of stars. Must only be called at the start of a line. -fn parse_headline_level_le(input: &str, max_level: usize) -> IResult<&str, usize, ()> { +pub(crate) fn parse_headline_level_le(input: &str, max_level: usize) -> IResult<&str, usize, ()> { let (input, level) = verify( map(is_a("*"), |s: &str| s.chars().count()), |level: &usize| *level <= max_level, @@ -667,31 +667,44 @@ fn line_length(input: &str) -> IResult<&str, usize, ()> { } } -pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> { - // Consume the headline. - let (text, level) = parse_headline_level_le(input, std::usize::MAX).ok()?; - +// Returns all text until a headline with level <= max_level is found. Must +// start at the start of the line. Can return nothing if immediately at a +// headline. +// +// This is a separate function from lines_while/lines_until because we need to +// treat EOF differently from EOL when the file ends with \r. +pub fn lines_until_headline_at_level_le( + input: &str, + max_level: usize, +) -> IResult<&str, (&str, usize), ()> { // Collect lines until EOF or a headline. let mut last = 0; - for i in memchr_iter(b'\n', text.as_bytes()) { + for i in memchr_iter(b'\n', input.as_bytes()) { // Check the first byte after the newline to skip parsing unnecessarily. - if text.as_bytes()[last] == b'*' && parse_headline_level_le(&text[last..], level).is_ok() { + if input.as_bytes()[last] == b'*' + && parse_headline_level_le(&input[last..], max_level).is_ok() + { break; } last = i + 1; } - if last < text.len() && parse_headline_level_le(&text[last..], level).is_err() { - Some(("", (input, level))) + if last < input.len() && parse_headline_level_le(&input[last..], max_level).is_err() { + Ok(("", (input, max_level))) } else { - Some(( - &text[last..], - (&input[..(input.len() - text.len()) + last], level), - )) + Ok((&input[last..], (&input[..last], max_level))) } } +pub fn parse_headline(input: &str) -> Option<(&str, (&str, usize))> { + // Consume the headline. + let (text, level) = parse_headline_level_le(input, std::usize::MAX).ok()?; + let (text, _content) = lines_until_headline_at_level_le(text, level).ok()?; + let split = input.len() - text.len(); + Some((&input[split..], (&input[..split], level))) +} + #[cfg(test)] mod tests { use super::*; diff --git a/tests/issue_24.rs b/tests/issue_24.rs new file mode 100644 index 0000000..e9dfb2a --- /dev/null +++ b/tests/issue_24.rs @@ -0,0 +1,18 @@ +use orgize::Org; + +#[test] +fn headline_in_drawer() { + // https://github.com/PoiScript/orgize/issues/24 + // A drawer may not contain a headline. + const STARS: &str = "****"; + for h1 in 1..STARS.len() { + for h2 in 1..STARS.len() { + let org = crate::Org::parse_string(format!( + "{} Hello\n:PROPERTIES:\n{} World\n:END:", + &STARS[..h1], + &STARS[..h2] + )); + assert_eq!(org.headlines().count(), 2); + } + } +}