Trying to parse LaTeX style displayed equations
geajack opened this issue · comments
Hi there,
I type up a lot of mathematical text in Typora, a markdown editor that uses a LaTeX-style syntax for equations. It looks like this:
This is an equation:
$$
1 + 1 = 2
$$
That will display the equation `1 + 1 = 2` in a mathematical font, on its own line and centered. Notice there is no empty line between the previous paragraph and the opening `$$`. Syntactically, you can insert one if you like, but it's optional, and Typora does not insert one by default.
I want to write a script that takes the markdown files I write in Typora and generates LaTeX files. I've been able to handle the case where there's an empty line between the preceding paragraph and the opening `$$`, but when there isn't one, I can't seem to get Marko to detect the equation. Here is my code:
import marko
import marko.block
import marko.inline
import re
from sys import stdin
class InlineFormula(marko.inline.InlineElement):
    """Inline element for text wrapped in ``$$ ... $$`` on a single line.

    This is the reporter's original reproduction: only the indentation that
    was lost in the paste is restored, the logic is unchanged.
    """

    # Lazily captures everything between a pair of `$$` delimiters; `.` does
    # not match newlines, so both delimiters must sit on the same line.
    pattern = re.compile("\\$\\$(.*?)\\$\\$", re.MULTILINE)
    # Presumably tells marko not to parse the captured text as inline
    # markdown -- TODO confirm against the marko docs.
    parse_children = False

    def __init__(self, match):
        """Store the text captured between the delimiters as ``content``."""
        self.content = match.group(1)
# Block-level `$$ ... $$` element from the reporter's failing reproduction.
# NOTE(review): indentation was lost when this was pasted, so the nesting of
# the statements below cannot be recovered with certainty; the code is kept
# verbatim and only annotated.
class BlockFormula(marko.block.BlockElement):
# Compiled opener pattern. `match` below passes its own "\\$\\$" literal to
# `expect_re` instead of using this attribute.
pattern = re.compile("\\$\\$", re.MULTILINE)
def __init__(self, match):
# Wraps the argument in a single RawText child (presumably the string
# returned by `parse` -- TODO confirm).
self.children = [marko.inline.RawText(match)]
@classmethod
def match(cls, source):
# Returns whatever `source.expect_re` yields for a `$$` opener.
match = source.expect_re("\\$\\$")
return match
@classmethod
def parse(cls, source):
# Presumably skips the opening `$$` line -- TODO confirm.
source.next_line()
source.consume()
lines = []
# NOTE(review): no `break` is visible on the closing `$$`, so the loop appears
# to run until the source is exhausted. Whether `source.consume()` sits inside
# the `if` or directly in the loop body is not recoverable from this paste.
while not source.exhausted:
line = source.next_line()
if line != "$$":
lines.append(line)
source.consume()
# The collected lines are concatenated and returned as one string.
return "".join(lines)
class Renderer:
    """Renderer mixin from the reporter's original reproduction.

    Relies on ``render_children`` being provided by the class it is mixed
    into. Only the indentation lost in the paste is restored; the output of
    every method is unchanged.
    """

    def render_document(self, element):
        """Return the rendered children of the document with no wrapper."""
        return self.render_children(element)

    def render_paragraph(self, element):
        """Return the rendered children followed by a blank line."""
        return self.render_children(element) + "\n\n"

    def render_inline_formula(self, element):
        """Return a visible placeholder showing ``element.content``."""
        return f"<INLINE FORMULA ({element.content})>"

    def render_block_formula(self, element):
        """Wrap the rendered children in a LaTeX ``equation*`` environment."""
        return "\\begin{equation*}\n" + self.render_children(element) + "\\end{equation*}\n\n"
class Extension:
    """Bundle of the custom elements and renderer mixin handed to marko."""

    # Custom elements registered with the parser.
    elements = [BlockFormula, InlineFormula]
    # Mixins that supply the render_* methods for those elements.
    renderer_mixins = [Renderer]
# Build the converter with the custom extension, then convert everything
# read from standard input and print the result.
markdown = marko.Markdown(
    extensions=[Extension],
)
text = stdin.read()
output = markdown.convert(text)
print(output)
And here is a sample input file:
This is $$an inline formula$$.
This is a block formula with no gap
$$
1 + 1 = 2
$$
This is a block formula with a gap
$$
1 + 1 = 2
$$
This results in the output:
This is <INLINE FORMULA (an inline formula)>.
This is a block formula with no gap
$$
1 + 1 = 2
$$
This is a block formula with a gap
\begin{equation*}
1 + 1 = 2
\end{equation*}
As you can see, the block formula with no empty line before it has not been parsed. You can see in my code that I tried two different implementations (one using `marko.block.BlockElement` and one using `InlineElement`), but neither one works.
How can I do this?
I refined your script so that it works correctly; here it is:
import marko
import marko.block
import marko.inline
from sys import stdin
from marko.md_renderer import MarkdownRenderer
import re
class InlineFormula(marko.inline.InlineElement):
    """Inline element for text wrapped in ``$$ ... $$`` on a single line."""

    # Lazily captures everything between a pair of `$$` delimiters. Written as
    # a raw string; no flags are needed because the pattern has no `^`/`$`
    # anchors for MULTILINE to act on.
    pattern = re.compile(r"\$\$(.*?)\$\$")
    # Presumably tells marko not to parse the captured text as inline
    # markdown -- TODO confirm against the marko docs.
    parse_children = False

    def __init__(self, match):
        """Store the text captured between the delimiters as ``content``."""
        self.content = match.group(1)
class BlockFormula(marko.block.BlockElement):
    """Block element for a display formula fenced by ``$$`` lines."""

    # An opening `$$` (optional trailing spaces) and a newline, then the
    # lazily captured body (at least one character), then a closing `$$`
    # that must start at the beginning of a line.
    pattern = re.compile(r"\$\$ *\n([\s\S]+?)^\$\$ *$", re.MULTILINE)

    def __init__(self, match):
        """Keep the formula body (capture group 1) as the ``children`` string."""
        self.children = match.group(1)

    @classmethod
    def match(cls, source):
        """Return the result of testing ``pattern`` against ``source``."""
        return source.expect_re(cls.pattern)

    @classmethod
    def parse(cls, source):
        """Return the current match and consume it from ``source``.

        The returned match is presumably what marko passes to ``__init__``
        -- TODO confirm against the marko docs.
        """
        # Read `source.match` before calling `consume()`.
        found = source.match
        source.consume()
        return found
class Paragraph(marko.block.Paragraph):
    """Paragraph that also ends where a ``$$`` block formula begins."""

    # Replaces the built-in paragraph element instead of adding a new one.
    override = True

    @classmethod
    def break_paragraph(cls, source, lazy=False):
        """Return True if the paragraph must stop at the current position.

        A block formula starting here breaks the paragraph; otherwise the
        decision is delegated to the built-in rules.
        """
        if BlockFormula.match(source):
            return True
        return super().break_paragraph(source, lazy=lazy)
class Renderer:
    """Renderer mixin supplying output for the two formula elements."""

    def render_inline_formula(self, element):
        """Return a visible placeholder showing ``element.content``."""
        return f"<INLINE FORMULA ({element.content})>"

    def render_block_formula(self, element):
        """Wrap ``element.children`` in a LaTeX ``equation*`` environment.

        ``children`` is used directly as a string here, not rendered.
        """
        return "\\begin{equation*}\n" + element.children + "\\end{equation*}\n\n"
class Extension:
    """Bundle of the custom elements and renderer mixin handed to marko."""

    # Custom elements registered with the parser, including the Paragraph
    # subclass that replaces the built-in one.
    elements = [BlockFormula, InlineFormula, Paragraph]
    # Mixins that supply the render_* methods for the formula elements.
    renderer_mixins = [Renderer]
# Build a converter on top of MarkdownRenderer with the custom extension,
# then convert everything read from standard input and print the result.
markdown = marko.Markdown(
    renderer=MarkdownRenderer,
    extensions=[Extension],
)
text = stdin.read()
output = markdown.convert(text)
print(output)
The above script produces:
This is <INLINE FORMULA (an inline formula)>.
This is a block formula with no gap
\begin{equation*}
1 + 1 = 2
\end{equation*}
This is a block formula with a gap
\begin{equation*}
1 + 1 = 2
\end{equation*}
Some important notes:
- By default, a CommonMark paragraph keeps consuming the next line as long as no breaking element is detected. So you need to subclass the default Paragraph element and change its `break_paragraph` check. `override = True` means "replace" the built-in paragraph element parsing.
- There is no need to rewrite the rendering of paragraphs; just use `MarkdownRenderer` as the base renderer, which renders the AST back to markdown.
- I also changed the parsing logic to be more efficient.
Hope that helps. (Feel free to close the issue if the problem is solved.)
Great! That works perfectly. Thanks. Below is my complete script, if anyone else has a similar use-case.
import marko
import marko.block
import marko.inline
from marko.md_renderer import MarkdownRenderer
from sys import stdin
import re
class BlockFormula(marko.block.BlockElement):
    """Block element for a display formula fenced by ``$$`` lines."""

    # An opening `$$` (optional trailing spaces) and a newline, then the
    # lazily captured body (at least one character), then a closing `$$`
    # that must start at the beginning of a line.
    pattern = re.compile(r"\$\$ *\n([\s\S]+?)^\$\$ *$", re.MULTILINE)

    def __init__(self, match):
        """Wrap the formula body (capture group 1) in a single RawText child."""
        self.children = [marko.inline.RawText(match.group(1))]

    @classmethod
    def match(cls, source):
        """Return the result of testing ``pattern`` against ``source``."""
        return source.expect_re(cls.pattern)

    @classmethod
    def parse(cls, source):
        """Return the current match and consume it from ``source``.

        The returned match is presumably what marko passes to ``__init__``
        -- TODO confirm against the marko docs.
        """
        # Read `source.match` before calling `consume()`.
        found = source.match
        source.consume()
        return found
class Paragraph(marko.block.Paragraph):
    """Paragraph that also ends where a ``$$`` block formula begins."""

    # Replaces the built-in paragraph element instead of adding a new one.
    override = True

    @classmethod
    def break_paragraph(cls, source, lazy=False):
        """Return True if the paragraph must stop at the current position.

        A block formula starting here breaks the paragraph; otherwise the
        decision is delegated to the built-in rules.
        """
        if BlockFormula.match(source):
            return True
        return super().break_paragraph(source, lazy=lazy)
class Renderer:
    """Renderer mixin that emits a standalone LaTeX ``article`` document.

    Relies on ``render_children`` being provided by the class it is mixed
    into.
    """

    def render_document(self, element):
        """Wrap the rendered children in the preamble and the closing tag."""
        preamble = (
            "\\documentclass[10pt,a4paper]{article}\n"
            "\\usepackage{amsmath}\n"
            "\\usepackage{amsfonts}\n"
            "\\usepackage{amssymb}\n"
            "\\begin{document}\n"
        )
        return preamble + "\n" + self.render_children(element) + "\\end{document}"

    def render_paragraph(self, element):
        """Return the rendered children followed by a newline."""
        return self.render_children(element) + "\n"

    def render_heading(self, element):
        """Render a heading according to ``element.level``.

        Level 1 becomes the document title followed by the maketitle command;
        level 2 a section, level 3 a subsection, and any other level a
        subsubsection.
        """
        content = self.render_children(element)
        if element.level == 1:
            return "\\title{" + content + "}\n\\maketitle\n"
        tag = {2: "section", 3: "subsection"}.get(element.level, "subsubsection")
        # The backslash is escaped explicitly: a bare backslash before `{` in
        # an f-string is an invalid escape sequence and triggers a warning.
        return f"\\{tag}{{" + content + "}\n"

    def render_strong_emphasis(self, element):
        """Render strong emphasis as bold text."""
        return "\\textbf{" + self.render_children(element) + "}"

    def render_emphasis(self, element):
        """Render emphasis as italic text."""
        return "\\textit{" + self.render_children(element) + "}"

    def render_block_formula(self, element):
        """Wrap the rendered children in a LaTeX ``equation*`` environment."""
        return "\n\\begin{equation*}\n" + self.render_children(element) + "\\end{equation*}\n"
class Extension:
    """Bundle of the custom elements and renderer mixin handed to marko."""

    # Custom elements registered with the parser, including the Paragraph
    # subclass that replaces the built-in one.
    elements = [BlockFormula, Paragraph]
    # Mixins that supply the LaTeX render_* methods.
    renderer_mixins = [Renderer]
if __name__ == "__main__":
    # Convert the markdown read from standard input and print the result.
    markdown = marko.Markdown(renderer=MarkdownRenderer, extensions=[Extension])
    text = stdin.read()
    output = markdown.convert(text)
    print(output)