tlienart / FranklinParser.jl

This path is currently removed. Before that, what we were doing was:

For a text span, determine which tokens are in that span
Bundle them in a TextBlock whose inner tokens are a view of the tokens in the relevant range.

See here in partition:

Lines 46 to 76 in e9d75f9

    
           isempty(blocks) && return [TextBlock(s, tokens)] 
        
           # disable additional blocks if desired 
        
           isempty(disable) || filter!(t -> t.name ∉ disable, blocks) 
        
           # Form a full partition with text blocks and blocks. 
        
           parent = parent_string(s) 
        
           first_block = blocks[1] 
        
           last_block  = blocks[end] 
        
           # add Text at beginning if first block is not there 
        
           if from(s) < from(first_block) 
        
               inter = subs(parent, from(s), prev_index(first_block)) 
        
               tb    = TextBlock(inter, tokens) 
        
               push!(parts, tb) 
        
           end 
        
           # Go through blocks and add text with what's between them 
        
           for i in 1:length(blocks)-1 
        
               bi   = blocks[i] 
        
               bip1 = blocks[i+1] 
        
               push!(parts, blocks[i]) 
        
               inter = subs(parent, next_index(bi), prev_index(bip1)) 
        
               isempty(inter) || push!(parts, TextBlock(inter, tokens)) 
        
           end 
        
           push!(parts, last_block) 
        
           # add Text at the end if last block is not there 
        
           if to(s) > to(last_block) 
        
               inter = subs(parent, next_index(last_block), to(s)) 
        
               push!(parts, TextBlock(inter, tokens))

The relevant function was using findfirst/findlast, and its cost ended up accumulating a lot:

FranklinParser.jl/src/utils/types.jl

Lines 74 to 83 in e9d75f9

    
           """
               TextBlock(ss::SS, it=EMPTY_TOKEN_SVEC)

           Build a `:TEXT` block over the substring `ss`.

           When `it` is non-empty, the block is given a view of `it` running from the
           first token `t` with `from(ss) <= from(t)` to the last token `t` with
           `to(t) <= to(ss)` that is not an EOS token. If `it` is empty, or if either
           of those two tokens cannot be found, the block is built without inner tokens.
           """
           function TextBlock(ss::SS, it=EMPTY_TOKEN_SVEC)::Block
               # No candidate tokens at all: plain text block.
               if isempty(it)
                   return Block(:TEXT, ss)
               end

               span_start, span_stop = from(ss), to(ss)

               # Linear scans over `it` for the first and last token inside the span.
               lo = findfirst(tok -> span_start <= from(tok), it)
               hi = findlast(tok -> to(tok) <= span_stop && !is_eos(tok), it)

               # Either bound missing: fall back to a plain text block.
               if lo === nothing || hi === nothing
                   return Block(:TEXT, ss)
               end

               return Block(:TEXT, ss, view(it, lo:hi))
           end

However, there's no magic: if we have to retokenize every text block on the Franklin side, it does take some time. So ideally we'd do this in a performant way in TextBlock, one that does not use this findfirst/findlast stuff.

Once this is done, Xranklin needs to be made to use it properly; e.g. the env processing needs to use the inner tokens instead of just repartitioning.

https://github.com/tlienart/Xranklin.jl/blob/5b92df341b36747b71c127e42ac117117432c14b/src/convert/markdown/latex_objects.jl#L564-L574

Also potentially item candidates, rows, links, ...

For link stuff, we need to review whether what is done in Xranklin is over the top; it seems like there's link-type detection again from b.ss, which should not be required.

	isempty(blocks) && return [TextBlock(s, tokens)]

	# disable additional blocks if desired
	isempty(disable) \|\| filter!(t -> t.name ∉ disable, blocks)

	# Form a full partition with text blocks and blocks.
	parent = parent_string(s)
	first_block = blocks[1]
	last_block = blocks[end]

	# add Text at beginning if first block is not there
	if from(s) < from(first_block)
	inter = subs(parent, from(s), prev_index(first_block))
	tb = TextBlock(inter, tokens)
	push!(parts, tb)
	end

	# Go through blocks and add text with what's between them
	for i in 1:length(blocks)-1
	bi = blocks[i]
	bip1 = blocks[i+1]
	push!(parts, blocks[i])
	inter = subs(parent, next_index(bi), prev_index(bip1))
	isempty(inter) \|\| push!(parts, TextBlock(inter, tokens))
	end
	push!(parts, last_block)

	# add Text at the end if last block is not there
	if to(s) > to(last_block)
	inter = subs(parent, next_index(last_block), to(s))
	push!(parts, TextBlock(inter, tokens))

	# Build a `:TEXT` block over `ss`. If `it` is non-empty and contains both a first
	# token `t` with `from(ss) <= from(t)` and a last non-EOS token `t` with
	# `to(t) <= to(ss)`, the block carries a view of `it` between those two tokens;
	# otherwise it is built without inner tokens.
	function TextBlock(ss::SS, it=EMPTY_TOKEN_SVEC)::Block
		if !isempty(it)
			lo = from(ss)
			hi = to(ss)
			# Linear scans for the first / last token lying inside the span.
			head = findfirst(tok -> lo <= from(tok), it)
			tail = findlast(tok -> to(tok) <= hi && !is_eos(tok), it)
			if !isnothing(head) && !isnothing(tail)
				return Block(:TEXT, ss, view(it, head:tail))
			end
		end
		# No tokens, or no usable token range: plain text block.
		return Block(:TEXT, ss)
	end

TextBlock and inner_tokens