将文本拆分为相等长度的字符串,以保持单词的完整性

Split text into equal length strings keeping words intact

本文关键字:字符串 完整性 单词 拆分 文本      更新时间:2023-09-26

我有这样的代码,它可以将较长的行分解为相等长度的字符串数组,保留单词。它还考虑了[[u;#fff;]some text]等格式,它可以拆分文本,以便每个字符串都可以独立转换为html:

// Matches one formatting span of the form [[<flags>;<field>;<field>...]text].
// Group 1 captures the format specification, group 2 the formatted text
// (which may contain an escaped closing bracket "\]").
var format_re = /\[\[([!gbiuso]*;[^;\]]*;[^;\]]*(?:;|[^\]()]*);?[^\]]*)\]([^\]]*\\\][^\]]*|[^\]]*|[^\[]*\[[^\]]*)\]?/gi;
// Captures only the opening tag "[[...]" of a formatting span.
var format_begin_re = /(\[\[[!gbiuso]*;[^;]*;[^\]]*\])/i;
// Matches an opening tag (closing bracket optional) at the very end of a string.
var format_last_re = /\[\[[!gbiuso]*;[^;]*;[^\]]*\]?$/i;

/**
 * Split a string into an array of lines holding at most `length` visible
 * characters each, so that every resulting line carries its own complete
 * formatting tags.
 *
 * Characters that belong to a "[[...]" opening tag and the closing "]" are
 * not counted; an HTML entity such as "&nbsp;" counts as one character.
 *
 * @param {string} str - text to split; may contain newlines, formatting
 *     spans and HTML entities.
 * @param {number} length - maximum number of visible characters per line.
 * @param {boolean} [words] - when truthy, break at the last space instead
 *     of in the middle of a word, and trim surrounding whitespace.
 * @returns {string[]} the resulting lines.
 * @throws {Error} when an "&" is not followed by a terminating ";".
 */
$.terminal.split_equal = function(str, length, words) {
  var formatting = false;
  var in_text = false;
  var prev_format = '';
  var result = [];
  // add format text as 5th parameter to formatting, it's used for
  // data attribute in format function
  var array = str.replace(format_re, function(_, format, text) {
    var semicolons = format.match(/;/g).length;
    // missing semicolons
    if (semicolons == 2) {
      semicolons = ';;';
    } else if (semicolons == 3) {
      semicolons = ';';
    } else {
      semicolons = '';
    }
    // closing bracket will break formatting so we need to escape
    // those using html entity equivalent
    return '[[' + format + semicolons +
      text.replace(/\\\]/g, '&#93;').replace(/\n/g, '\\n') + ']' +
      text + ']';
  }).split(/\n/g);
  for (var i = 0, len = array.length; i < len; ++i) {
    if (array[i] === '') {
      result.push('');
      continue;
    }
    var line = array[i];
    var first_index = 0; // start of the chunk currently being collected
    var count = 0;       // visible characters collected so far
    var space = -1;      // index just after the last breakable space
    for (var j=0, jlen=line.length; j<jlen; ++j) {
      if (line[j] === '[' && line[j+1] === '[') {
        // NOTE(review): in_text is not reset here. When j has been rewound
        // to a space located before this tag, in_text can still be true
        // from the previous pass, so the format specification gets counted
        // as visible text and the line comes out too short.
        formatting = true;
      } else if (formatting && line[j] === ']') {
        // first "]" ends the opening tag, second "]" ends the span
        if (in_text) {
          formatting = false;
          in_text = false;
        } else {
          in_text = true;
        }
      } else if ((formatting && in_text) || !formatting) {
        if (line[j] === '&') { // treat entity as one character
          var m = line.substring(j).match(/^(&[^;]+;)/);
          if (!m) {
            // should never happen if used by terminal,
            // because it always calls $.terminal.encode
            // before this function
            throw new Error("Unclosed html entity in line " +
                            (i+1) + ' at char ' + (j+1));
          }
          j+=m[1].length-2; // because continue adds 1 to j
          // if entity is at the end there is no next loop
          // issue #77
          if (j === jlen-1) {
            // NOTE(review): `output` is the hoisted variable assigned
            // further down; here it still holds the previously emitted
            // chunk (or undefined) - confirm this is intended.
            result.push(output + m[1]);
          }
          continue;
        } else if (line[j] === ']' && line[j-1] === '\\') {
          // escape \] counts as one character
          --count;
        } else {
          ++count;
        }
      }
      // true when the character(s) right before position j are a space
      function is_space() {
        return line.substring(j-6, j) == '&nbsp;' ||
          line.substring(j-1, j) == ' ';
      }
      if (is_space() && ((formatting && in_text) || !formatting)) {
        space = j;
      }
      if ((count === length || j === jlen-1) &&
          ((formatting && in_text) || !formatting)) {
        var output;
        // look ahead from the last space to check, on the rendered text,
        // whether there is whitespace to break on
        var after = line.substring(space, j+length+1);
        var text = $('<span>' + after + '</span>').text();
        var can_break = text.match(/\s/);
        if (words && space != -1 && j !== jlen-1 && can_break) {
          // get text to last space
          output = line.substring(first_index, space);
          j = space-1;
          space = -1;
        } else {
          output = line.substring(first_index, j+1);
        }
        if (words) {
          output = output.replace(/^(&nbsp;|\s)+|(&nbsp;|\s)+$/g, '');
        }
        first_index = j+1;
        count = 0;
        // re-open the formatting that was still active on the previous chunk
        if (prev_format) {
          output = prev_format + output;
          if (output.match(']')) {
            prev_format = '';
          }
        }
        // Fix output if formatting not closed
        var matched = output.match(format_re);
        if (matched) {
          var last = matched[matched.length-1];
          if (last[last.length-1] !== ']') {
            // span continues on the next chunk: close it here and
            // remember its opening tag
            prev_format = last.match(format_begin_re)[1];
            output += ']';
          } else if (output.match(format_last_re)) {
            // chunk ends with a bare opening tag: move it to the next chunk
            output = output.replace(format_last_re, '');
            prev_format = last.match(format_begin_re)[1];
          }
        }
        result.push(output);
      }
    }
  }
  return result;
};

它几乎可以正常工作,但有些行比应有的长度更短:

is cracker.The term

在这个FIDDLE中,当您去掉格式并选中复选框时,它就可以正常工作。我为此工作了几个小时,不知道为什么这条线更短,任何帮助都将不胜感激。

以下是如何修复原始代码:

在第40行后添加以下内容:

in_text = false;

代码使用in_text标志来确定当前位置是否为常规文本。但是,当它进入格式化标记的区域时,并没有清除该标志。这就是问题中所描述的"超短行"这一主要问题的原因。

将第76/77行的if语句更改为:

if (is_space() && ((formatting && in_text) || !formatting || (line[j] === '[' && line[j+1] === '['))) {

这解决了一个较小的问题,即在常规文本和格式化文本之间的空格上没有换行。

可运行的fiddle在这里:https://jsfiddle.net/2w10xp3m/1/

我想我已经用一种简单得多的方法解决了这个问题。首先分解所有单词,然后重新组合行,同时跟踪当前格式。请参阅JsFiddle。

JavaScript

/**
 * Split text into lines of at most `length` visible characters without
 * breaking words. The input is split into words on "&nbsp;", and the lines
 * are re-assembled word by word (joined with a plain space) while tracking
 * the currently open "[[...]" formatting tag, so that every word is wrapped
 * in its own complete formatting.
 *
 * @param {string} str - text whose words are separated by "&nbsp;".
 * @param {number} length - maximum number of visible characters per line.
 * @param {*} [words] - kept for signature compatibility; not used.
 * @returns {string[]} the assembled lines.
 */
$.terminal.split_equal = function(str, length, words) {
  var result = [];
  var currentFormat = null;
  var currentLine = '';
  var currentLineLengthWithoutFormatting = 0;
  // 1. Split words on &nbsp;
  var parts = str.split(/&nbsp;/g);
  // 2. Re-assemble lines while keeping track of current formats
  parts.forEach(function(word) {
    // Keep track of current format: an opening tag at the start of the
    // word becomes the active format and is stripped from the word
    var format = word.match(/^\[\[([^\]]+)\]/);
    var wordWithFormatting, wordLength, separatorLength;
    if (format !== null && format[0]) {
      currentFormat = format[0];
      word = word.slice(format[0].length);
    }
    // Apply current format to each word separately
    wordLength = word.length;
    wordWithFormatting = (currentFormat || '') + word;
    if (currentFormat) {
      if (word.indexOf(']') !== -1) {
        // the word carries the closing bracket: it is not a visible
        // character, and the format ends with this word
        wordLength--;
        currentFormat = null;
      } else {
        wordWithFormatting += ']';
      }
    }
    // Assemble line; the joining space counts against the limit too
    separatorLength = currentLineLengthWithoutFormatting > 0 ? 1 : 0;
    if (currentLineLengthWithoutFormatting + separatorLength + wordLength <= length) {
      // Word still fits on current line
      if (separatorLength > 0) {
        currentLine += ' ';
        currentLineLengthWithoutFormatting++;
      }
    } else {
      // Need to start new line; skip the push when nothing was collected
      // (e.g. the very first word is longer than `length`)
      if (currentLine !== '') {
        result.push(currentLine);
      }
      currentLine = '';
      currentLineLengthWithoutFormatting = 0;
    }
    currentLine += wordWithFormatting;
    currentLineLengthWithoutFormatting += wordLength;
  });
  if (currentLineLengthWithoutFormatting > 0) {
    result.push(currentLine);
  }
  return result;
};

npm包段落生成器将连续的文本拆分为所谓的段落,这些段落分布均匀,字数大小大致相同。这个段落的概念似乎就是你要搜索的。

您可以定义段落的字数。考虑到每个页面的平均字符数(含空格)大致相同,您也可以将段落的原则扩展到页面。

此段落生成器节点脚本从连续文本生成段落。它输出的文本中,每个段落的大小大致相同,从而在文本中提供段落的均匀分布。它不会在数字上拆分文本,例如"1.2"。

有一个选项可以定义段落之间的分隔符,也可以将段落提取到字符串数组中,然后对其应用html标记<p>。请查看其文档以了解更多细节。