Trying to parse LaTeX style displayed equations

Question

Trying to parse LaTeX style displayed equations

geajack opened this issue 3 years ago · comments

Hi there,

I type up a lot of mathematical text in Typora, a markdown editor that uses a LaTeX-style syntax for equations. It looks like this:

This is an equation:
$$
1 + 1 = 2
$$

That will display the equation 1 + 1 = 2 in a mathematical font, on its own line and centered. Notice there is no empty line between the previous paragraph and the opening $$. Syntactically, you can insert one if you like, but it's optional, and Typora does not insert one by default.

I want to write a script to take the markdown files I write in Typora and generate LaTeX files. I've been able to handle the case where there's an empty line between the preceding paragraph and the opening $$, but when there isn't one, I can't seem to get Marko to detect the equation. Here is my code:

import marko
import marko.block
import marko.inline

import re
from sys import stdin

class InlineFormula(marko.inline.InlineElement):
    """Inline element for text written between a pair of ``$$`` markers."""

    # Opening "$$", a lazily matched capture (group 1), then closing "$$".
    pattern = re.compile(r"\$\$(.*?)\$\$", re.MULTILINE)
    # marko setting; presumably stops the captured text from being parsed
    # as nested inline elements -- confirm against the marko docs.
    parse_children = False

    def __init__(self, match):
        """Store the text captured between the delimiters as ``content``."""
        formula_text = match.group(1)
        self.content = formula_text

class BlockFormula(marko.block.BlockElement):
    """Block element for a displayed formula delimited by ``$$`` lines."""

    # Matches the "$$" delimiter only, not the formula body.
    pattern = re.compile("\\$\\$", re.MULTILINE)

    def __init__(self, match):
        """Wrap the given value in a single ``RawText`` child.

        NOTE(review): ``match`` is presumably the string returned by
        ``parse`` rather than a regex match object -- confirm against
        marko's block parser.
        """
        self.children = [marko.inline.RawText(match)]

    @classmethod
    def match(cls, source):
        """Return whatever ``source.expect_re`` yields for the "$$" pattern."""
        match = source.expect_re("\\$\\$")
        return match

    @classmethod
    def parse(cls, source):
        """Collect the lines after the opening delimiter and return them joined.

        NOTE(review): the loop has no exit at the closing delimiter; it runs
        until ``source.exhausted`` and only leaves out lines equal to "$$".
        """
        # Step over the opening delimiter line.
        source.next_line()
        source.consume()
        lines = []
        while not source.exhausted:
            line = source.next_line()
            # Keep every line except one that is exactly "$$".
            if line != "$$":
                lines.append(line)
            source.consume()
        return "".join(lines)

class Renderer:
    """Renderer mixin producing plain text with LaTeX-style block formulas."""

    def render_document(self, element):
        """Return the rendered children of the document unchanged."""
        rendered = self.render_children(element)
        return rendered

    def render_paragraph(self, element):
        """Return the rendered children followed by two newlines."""
        return "".join((self.render_children(element), "\n\n"))

    def render_inline_formula(self, element):
        """Return a visible placeholder holding the inline formula text."""
        return "<INLINE FORMULA ({})>".format(element.content)

    def render_block_formula(self, element):
        """Wrap the rendered children in an ``equation*`` environment."""
        body = self.render_children(element)
        return "".join(("\\begin{equation*}\n", body, "\\end{equation*}\n\n"))

class Extension:
    """Bundle of custom elements and renderer mixin handed to marko."""

    # Element classes to register with the parser.
    elements = [
        BlockFormula,
        InlineFormula,
    ]
    # Mixins supplying the render_* methods for those elements.
    renderer_mixins = [
        Renderer,
    ]

# Build a converter with the formula extension registered.
markdown = marko.Markdown(extensions=[Extension])
# Read the whole markdown document from standard input.
text = stdin.read()
# Convert the document and print the result.
output = markdown.convert(text)
print(output)

And here is a sample input file:

This is $$an inline formula$$.

This is a block formula with no gap
$$
1 + 1 = 2
$$

This is a block formula with a gap

$$
1 + 1 = 2
$$

This results in the output:

This is <INLINE FORMULA (an inline formula)>.

This is a block formula with no gap
$$
1 + 1 = 2
$$

This is a block formula with a gap

\begin{equation*}
1 + 1 = 2
\end{equation*}

As you can see, the block formula with no empty line before it has not been parsed. You can see in my code I tried two different implementations - one using marko.block.BlockElement and one using InlineElement, but neither one works.

How can I do this?

Frost Ming · Answer 1 · Sat Mar 27 2021 23:41:25 GMT+0800 (China Standard Time)

I refined your script so that it works correctly. Here it is:

import marko
import marko.block
import marko.inline
from sys import stdin
from marko.md_renderer import MarkdownRenderer

import re


class InlineFormula(marko.inline.InlineElement):
    """Inline element for a formula enclosed in ``$$ ... $$`` on one line."""

    # Group 1 lazily captures the text between the two "$$" markers.
    pattern = re.compile(r"\$\$(.*?)\$\$", re.MULTILINE)
    # marko setting; presumably leaves the captured text unparsed --
    # confirm against the marko docs.
    parse_children = False

    def __init__(self, match):
        """Keep the captured formula text on ``self.content``."""
        captured = match.group(1)
        self.content = captured


class BlockFormula(marko.block.BlockElement):
    """Block element for a displayed formula fenced by ``$$`` lines."""

    # "$$" plus optional spaces and a newline, a lazily matched body
    # (group 1), then "$$" plus optional spaces filling a later line.
    pattern = re.compile(r"\$\$ *\n([\s\S]+?)^\$\$ *$", re.MULTILINE)

    def __init__(self, match):
        """Store the captured formula body (a string) as ``children``."""
        formula_body = match.group(1)
        self.children = formula_body

    @classmethod
    def match(cls, source):
        """Return whatever ``source.expect_re`` yields for ``pattern``."""
        found = source.expect_re(cls.pattern)
        return found

    @classmethod
    def parse(cls, source):
        """Consume the matched text and return the match recorded on ``source``."""
        # Read the match before consuming, in the same order as before.
        recorded = source.match
        source.consume()
        return recorded


class Paragraph(marko.block.Paragraph):
    """Paragraph element that also ends where a block formula begins."""

    # Tells marko to use this class in place of its built-in Paragraph.
    override = True

    @classmethod
    def break_paragraph(cls, source, lazy=False):
        """Return True if a BlockFormula matches; otherwise defer to the base."""
        if not BlockFormula.match(source):
            return super().break_paragraph(source, lazy=lazy)
        return True


class Renderer:
    """Renderer mixin adding output for the two formula elements."""

    def render_inline_formula(self, element):
        """Return a visible placeholder holding the inline formula text."""
        return "<INLINE FORMULA ({})>".format(element.content)

    def render_block_formula(self, element):
        """Wrap the formula body string in an ``equation*`` environment."""
        pieces = ("\\begin{equation*}\n", element.children, "\\end{equation*}\n\n")
        return "".join(pieces)


class Extension:
    """Bundle of custom elements and renderer mixin handed to marko."""

    # Element classes to register, including the Paragraph override.
    elements = [
        BlockFormula,
        InlineFormula,
        Paragraph,
    ]
    # Mixins supplying the render_* methods for the formula elements.
    renderer_mixins = [
        Renderer,
    ]


# Build a converter on top of MarkdownRenderer with the extension registered.
markdown = marko.Markdown(renderer=MarkdownRenderer, extensions=[Extension])

# Read the whole markdown document from standard input.
text = stdin.read()

# Convert the document and print the result.
output = markdown.convert(text)
print(output)

The above script produces:

This is <INLINE FORMULA (an inline formula)>.

This is a block formula with no gap
\begin{equation*}
1 + 1 = 2
\end{equation*}


This is a block formula with a gap

\begin{equation*}
1 + 1 = 2
\end{equation*}

Some important notes:

CommonMark's paragraph by default continues to consume the next line if no breaking elements are detected. So you need to subclass the default Paragraph element and extend its break_paragraph check. override = True means to "replace" the built-in paragraph element parsing.
No need to rewrite the rendering of paragraph, just use MarkdownRenderer as the base renderer, which renders the AST back to markdown.
Also changed the parsing logic to be more efficient.

Hope that helps. (Or you can just close the issue if the problem is solved.)

geajack · Answer 2 · Sun Mar 28 2021 02:38:15 GMT+0800 (China Standard Time)

Great! That works perfectly. Thanks. Below is my complete script, if anyone else has a similar use-case.

import marko
import marko.block
import marko.inline
from marko.md_renderer import MarkdownRenderer

from sys import stdin
import re
class BlockFormula(marko.block.BlockElement):
    """Block element for a displayed formula fenced by ``$$`` lines."""

    # "$$" plus optional spaces and a newline, a lazily matched body
    # (group 1), then "$$" plus optional spaces filling a later line.
    pattern = re.compile(r"\$\$ *\n([\s\S]+?)^\$\$ *$", re.MULTILINE)

    def __init__(self, match):
        """Wrap the captured formula body in a single ``RawText`` child."""
        formula_body = match.group(1)
        raw_child = marko.inline.RawText(formula_body)
        self.children = [raw_child]

    @classmethod
    def match(cls, source):
        """Return whatever ``source.expect_re`` yields for ``pattern``."""
        found = source.expect_re(cls.pattern)
        return found

    @classmethod
    def parse(cls, source):
        """Consume the matched text and return the match recorded on ``source``."""
        # Read the match before consuming, in the same order as before.
        recorded = source.match
        source.consume()
        return recorded

class Paragraph(marko.block.Paragraph):
    """Paragraph element that also ends where a block formula begins."""

    # Tells marko to use this class in place of its built-in Paragraph.
    override = True

    @classmethod
    def break_paragraph(cls, source, lazy=False):
        """Return True if a BlockFormula matches; otherwise defer to the base."""
        formula_found = BlockFormula.match(source)
        return (
            True
            if formula_found
            else super().break_paragraph(source, lazy=lazy)
        )


class Renderer:
    """Renderer mixin that emits LaTeX for the elements it overrides.

    Each method receives a parsed element and returns a string. Child
    content comes from ``self.render_children``, which this mixin does not
    define and which must be supplied by the renderer it is mixed into.
    """

    # LaTeX sectioning command per heading level; level 1 is handled
    # separately as the title, and any other level uses the fallback.
    _SECTION_TAGS = {2: "section", 3: "subsection"}
    _DEFAULT_SECTION_TAG = "subsubsection"

    def render_document(self, element):
        """Return the rendered children wrapped in a LaTeX article skeleton."""
        preamble = (
            "\\documentclass[10pt,a4paper]{article}\n"
            "\\usepackage{amsmath}\n"
            "\\usepackage{amsfonts}\n"
            "\\usepackage{amssymb}\n"
            "\n"
            "\\begin{document}\n"
        )
        return preamble + "\n" + self.render_children(element) + "\\end{document}"

    def render_paragraph(self, element):
        """Return the rendered children followed by a newline."""
        return self.render_children(element) + "\n"

    def render_heading(self, element):
        """Render a heading as a title (level 1) or a sectioning command.

        Level 2 becomes ``section``, level 3 ``subsection`` and every other
        level ``subsubsection``.
        """
        heading_level = element.level
        if heading_level == 1:
            return "\\title{" + self.render_children(element) + "}\n\\maketitle\n"
        tag = self._SECTION_TAGS.get(heading_level, self._DEFAULT_SECTION_TAG)
        # The backslash is written as an explicit "\\": the previous
        # f"\{tag}" form relied on the invalid escape sequence "\{".
        return "\\" + tag + "{" + self.render_children(element) + "}\n"

    def render_strong_emphasis(self, element):
        """Wrap the rendered children in a ``textbf`` command."""
        return "\\textbf{" + self.render_children(element) + "}"

    def render_emphasis(self, element):
        """Wrap the rendered children in a ``textit`` command."""
        return "\\textit{" + self.render_children(element) + "}"

    def render_block_formula(self, element):
        """Wrap the rendered children in an ``equation*`` environment."""
        return (
            "\n\\begin{equation*}\n"
            + self.render_children(element)
            + "\\end{equation*}\n"
        )

class Extension:
    """Bundle of custom elements and renderer mixin handed to marko."""

    # Element classes to register, including the Paragraph override.
    elements = [
        BlockFormula,
        Paragraph,
    ]
    # Mixins supplying the LaTeX render_* methods.
    renderer_mixins = [
        Renderer,
    ]

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    # Build a converter on top of MarkdownRenderer with the extension registered.
    markdown = marko.Markdown(renderer=MarkdownRenderer, extensions=[Extension])
    # Read the whole markdown document from standard input.
    text = stdin.read()
    # Convert the document and print the result.
    output = markdown.convert(text)
    print(output)