kevinboone / epub2txt2

A simple command-line utility for Linux, for extracting text from EPUB documents.

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

Enable/disable ANSI escape sequences on screen lines instead of only following XHTML open/close tags.

rajeevvp opened this issue · comments

epub2txt should, I think, set and reset ANSI escape sequences for things like <em> tags for
each on-screen line instead of only setting/resetting them when the open/close tags are
encountered.

Otherwise, since less -R resets ANSI colour attributes automatically at the end of each line,
when an escape sequence spans multiple lines, only the first line has the proper colour
attribute.

more doesn't have this problem, and neither does less -r, but both can potentially mess up
the display in other ways.

Thanks,
RVP

And, here is a minimally invasive patch to do just that:

diff -urN epub2txt2-master.orig/src/wrap.c epub2txt2-master/src/wrap.c
--- epub2txt2-master.orig/src/wrap.c	2020-01-04 22:30:09.000000000 +0530
+++ epub2txt2-master/src/wrap.c	2020-01-09 14:23:59.128292592 +0530
@@ -33,7 +33,9 @@
   int state;
   int column;
   int white_count;
+  unsigned int fmt;
   void *app_data;
+  void *app_opts;
   BOOL blank_line;
   WT_UTF32 last;
   WT_UTF32 *token;
@@ -138,6 +140,7 @@
   if (l + context->priv->column + 1 >= context->priv->width)
     {
     _wraptext_emit_newline (context);
+    xhtml_emit_fmt_eol (context);
     context->priv->column = 0;
     }
  
@@ -330,6 +333,7 @@
   self->priv->column = 0;
   self->priv->last = 0;
   self->priv->white_count = 0;
+  self->priv->fmt = 0;
   self->priv->blank_line = TRUE;
   if (self->priv->token) free (self->priv->token);
   self->priv->token = NULL;
@@ -353,6 +357,36 @@
   self->priv->flags = flags;
   }
 
+void wraptext_context_zero_fmt (WrapTextContext *self)
+  {
+  self->priv->fmt = 0;
+  }
+
+unsigned int wraptext_context_get_fmt (WrapTextContext *self)
+  {
+  return self->priv->fmt;
+  }
+
+void wraptext_context_set_fmt (WrapTextContext *self, unsigned int fmt)
+  {
+  self->priv->fmt |= fmt;
+  }
+
+void wraptext_context_reset_fmt (WrapTextContext *self, unsigned int fmt)
+  {
+  self->priv->fmt &= ~fmt;
+  }
+
+void wraptext_context_set_app_opts (WrapTextContext *self, void *app_opts)
+  {
+  self->priv->app_opts = app_opts;
+  }
+
+void *wraptext_context_get_app_opts (WrapTextContext *self)
+  {
+  return self->priv->app_opts;
+  }
+
 void wraptext_context_set_app_data (WrapTextContext *self, void *app_data)
   {
   self->priv->app_data = app_data;
diff -urN epub2txt2-master.orig/src/wrap.h epub2txt2-master/src/wrap.h
--- epub2txt2-master.orig/src/wrap.h	2020-01-04 22:30:09.000000000 +0530
+++ epub2txt2-master/src/wrap.h	2020-01-09 14:22:44.780569828 +0530
@@ -38,6 +38,13 @@
 void wraptext_context_set_output_fn (WrapTextContext *self, 
   WrapTextOutputFn fn);
 
+unsigned int wraptext_context_get_fmt (WrapTextContext *self);
+void wraptext_context_zero_fmt (WrapTextContext *self);
+void wraptext_context_set_fmt (WrapTextContext *self, unsigned int fmt);
+void wraptext_context_reset_fmt (WrapTextContext *self, unsigned int fmt);
+void wraptext_context_set_app_opts (WrapTextContext *self, void *app_opts);
+void *wraptext_context_get_app_opts (WrapTextContext *self);
+
 void wraptext_context_set_flags (WrapTextContext *self, int flags);
 
 void wraptext_context_set_width (WrapTextContext *self, int width);
diff -urN epub2txt2-master.orig/src/xhtml.c epub2txt2-master/src/xhtml.c
--- epub2txt2-master.orig/src/xhtml.c	2020-01-04 22:30:09.000000000 +0530
+++ epub2txt2-master/src/xhtml.c	2020-01-09 14:40:49.054307798 +0530
@@ -35,6 +35,8 @@
                FORMAT_H4_ON, FORMAT_H4_OFF,
                FORMAT_H5_ON, FORMAT_H5_OFF } Format;
 
+enum { FMT_BOLD = 0x01, FMT_ITAL = 0x02 };	/* bitmasks */
+
 
 /*============================================================================
   xhtml_is_start_format_tag
@@ -208,6 +210,83 @@
   OUT
   }
 
+/*============================================================================
+  xhtml_emit_fmt_eol
+============================================================================*/
+void xhtml_emit_fmt_eol (WrapTextContext *context)
+  {
+  IN
+  
+  unsigned int fmt = wraptext_context_get_fmt(context);
+  const Epub2TxtOptions *options = (Epub2TxtOptions *) wraptext_context_get_app_opts(context);
+
+  if (options->ansi && !options->raw && fmt)
+    {
+    /* reset ANSI escape-sequence at EOL. */
+    xhtml_emit_format(options, FORMAT_BOLD_OFF);
+
+    /* turn those set back on at BOL. */
+    if (fmt & FMT_BOLD)
+         xhtml_emit_format(options, FORMAT_BOLD_ON);
+    if (fmt & FMT_ITAL)
+         xhtml_emit_format(options, FORMAT_ITALIC_ON);
+
+    }
+  OUT
+  }
+
+/*============================================================================
+  xhtml_set_format
+============================================================================*/
+void xhtml_set_format (const Epub2TxtOptions *options, Format format, WrapTextContext *context)
+  {
+  IN
+  
+  if (options->ansi && !options->raw)
+    {
+    switch (format)
+      {
+      case FORMAT_BOLD_ON:
+         wraptext_context_set_fmt(context, FMT_BOLD);
+         break;
+
+      case FORMAT_BOLD_OFF:
+         wraptext_context_reset_fmt(context, FMT_BOLD);
+         break;
+
+      case FORMAT_ITALIC_ON:
+	 wraptext_context_set_fmt(context, FMT_ITAL);
+         break;
+
+      case FORMAT_ITALIC_OFF:
+	 wraptext_context_reset_fmt(context, FMT_ITAL);
+         break;
+
+      case FORMAT_NONE:
+         wraptext_context_zero_fmt(context);
+	 break;
+
+      case FORMAT_H1_ON:
+      case FORMAT_H2_ON:
+      case FORMAT_H3_ON:
+      case FORMAT_H4_ON:
+      case FORMAT_H5_ON:
+         wraptext_context_set_fmt(context, FMT_BOLD);
+	 break;
+
+      case FORMAT_H1_OFF:
+      case FORMAT_H2_OFF:
+      case FORMAT_H3_OFF:
+      case FORMAT_H4_OFF:
+      case FORMAT_H5_OFF:
+         wraptext_context_reset_fmt(context, FMT_BOLD);
+	 break;
+
+      }
+    }
+  OUT
+  }
+
 
 
 
@@ -629,6 +708,7 @@
 
      WrapTextContext *context = wraptext_context_new();
      wraptext_context_set_width (context, width);
+     wraptext_context_set_app_opts (context, (void *)options);
 
      Mode mode = MODE_ANY;
      BOOL inbody = FALSE;
@@ -771,6 +851,7 @@
 	      xhtml_flush_line (para, options, context); 
 	      wstring_clear (para);
               xhtml_emit_format (options, format);
+              xhtml_set_format(options, format, context);
 	      }
 	    }
 	  else if (xhtml_is_end_format_tag (ss_tag, &format))
@@ -779,6 +860,7 @@
 	      {
 	      xhtml_flush_line (para, options, context); 
               xhtml_emit_format (options, format);
+              xhtml_set_format(options, format, context);
 	      wstring_clear (para);
 	      }
             }
@@ -786,6 +868,7 @@
 	    {
             xhtml_flush_line (para, options, context);
             xhtml_emit_format (options, format);
+            xhtml_set_format(options, format, context);
 	    wstring_clear (para);
 	    xhtml_para_break (context, options);
             }
@@ -795,6 +878,7 @@
             xhtml_flush_line (para, options, context);
 	    wstring_clear (para);
             xhtml_emit_format (options, format);
+            xhtml_set_format(options, format, context);
             }
 
 	  free (ss_tag);