Pdf.js与文本选择

pdf.js with text selection

本文关键字:选择 文本 js Pdf      更新时间:2023-09-26

如何使PDF中的文本可选择?

我试过下面的代码。PDF渲染得很好,但无法选择文本。

https://github.com/mozilla/pdf.js

https://github.com/mozilla/pdf.js/blob/master/web/text_layer_builder.css
https://github.com/mozilla/pdf.js/blob/master/web/text_layer_builder.js
'use strict';
// Load file.pdf, paint its first page onto #the-canvas and build a text layer
// in #text-layer that is sized and positioned to match the canvas.
PDFJS.getDocument('file.pdf').then(function (pdfDocument) {
    var pageNumber = 1;

    // Draws the page bitmap and then asks TextLayerBuilder for the text overlay.
    function showPage(pdfPage) {
        var zoom = 1.5;
        var pageViewport = pdfPage.getViewport(zoom);

        var canvasEl = document.getElementById('the-canvas');
        var drawingContext = canvasEl.getContext('2d');
        canvasEl.height = pageViewport.height;
        canvasEl.width = pageViewport.width;

        // Place the text layer over the canvas, using the canvas' document offset.
        var offset = $(canvasEl).offset();
        var overlayCss = {
            height : pageViewport.height + 'px',
            width : pageViewport.width + 'px',
            top : offset.top,
            left : offset.left
        };
        var $overlay = $('#text-layer').css(overlayCss);

        var renderOptions = {
            canvasContext : drawingContext,
            viewport : pageViewport
        };
        pdfPage.render(renderOptions);

        pdfPage.getTextContent().then(function (content) {
            var builder = new TextLayerBuilder({
                textLayerDiv : $overlay.get(0),
                pageIndex : pageNumber - 1, // the builder is given a zero-based index
                viewport : pageViewport
            });
            builder.setTextContent(content);
            builder.render();
        });
    }

    pdfDocument.getPage(pageNumber).then(showPage);
});
<body>
  <div>
    <!-- Canvas the script looks up by id ("the-canvas") and renders the PDF page into. -->
    <canvas id="the-canvas" style="border:1px solid black;"></canvas>
    <!-- Container the script sizes and positions to match the canvas, then passes to TextLayerBuilder as textLayerDiv. -->
    <div id="text-layer" class="textLayer"></div>
  </div>
</body>

在pdf.js版本2.8.61上,被采纳的答案不再有效,因为renderTextLayer()已经集成到pdf.js中,不再需要额外的外部脚本,也不需要jQuery。

以下代码将使PDF文本可选择。它将加载以下PDF文档作为示例,请将其替换为您自己的文档:

https://raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/web/compressed.tracemonkey-pldi-09.pdf

它主要使用两个html元素:

<canvas id="the-canvas"></canvas>
<div class="textLayer"></div>

canvas用于显示不可选择的文档,.textLayer div用于显示可选择的文本。.textLayer div上的文本都是透明的,所以不可见,它只提供选择效果。


<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1.0,user-scalable=no">
<script src="//mozilla.github.io/pdf.js/build/pdf.js" crossorigin="anonymous"></script>
<link href="//mozilla.github.io/pdf.js/web/viewer.css" rel="stylesheet" type="text/css" />
<style type="text/css">
#the-canvas {
  border: 1px solid black;
  direction: ltr;
}
</style>
</head>
<body>
<h1>PDF.js Previous/Next example</h1>
<div>
  <button id="prev">Previous</button>
  <button id="next">Next</button>
  &nbsp; &nbsp;
  <span>Page: <span id="page_num"></span> / <span id="page_count"></span></span>
</div>
<canvas id="the-canvas"></canvas>
<div class="textLayer"></div>
<script>
// Location of the PDF to display. When it lives on another server, that
// server has to send the appropriate CORS headers.
var url = '//raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/web/compressed.tracemonkey-pldi-09.pdf';

// pdf.js was loaded through a <script> tag; take its exports from window.
var pdfjsLib = window['pdfjs-dist/build/pdf'];

// The worker script location has to be configured explicitly.
pdfjsLib.GlobalWorkerOptions.workerSrc = '//mozilla.github.io/pdf.js/build/pdf.worker.js';

// Viewer state: loaded document, current page, render-in-progress flag,
// page waiting to be rendered, zoom factor, and the drawing surface.
var pdfDoc = null;
var pageNum = 1;
var pageRendering = false;
var pageNumPending = null;
var scale = 1;
var canvas = document.getElementById('the-canvas');
var ctx = canvas.getContext('2d');
/**
 * Renders one page of the loaded document: resizes the canvas to the page
 * viewport, paints the page, then rebuilds the selectable text layer on top
 * of the canvas.
 *
 * `pageRendering` is set while work is in flight. Once the page has settled
 * (successfully or not) the flag is cleared and, if a page number was stored
 * in `pageNumPending` in the meantime, that page is rendered next.
 *
 * @param num Page number.
 */
function renderPage(num) {
  pageRendering = true;
  // Using promise to fetch the page
  pdfDoc.getPage(num).then(function(page) {
    var viewport = page.getViewport({scale: scale});
    canvas.height = viewport.height;
    canvas.width = viewport.width;
    // Render PDF page into canvas context
    var renderTask = page.render({
      canvasContext: ctx,
      viewport: viewport
    });
    // Wait for the bitmap, then fetch the page's text content
    return renderTask.promise.then(function() {
      return page.getTextContent();
    }).then(function(textContent) {
      var textLayer = document.querySelector(".textLayer");
      // renderTextLayer() adds spans to the container, so the spans of the
      // previously shown page have to be removed first; otherwise the text
      // of every visited page stays selectable on top of the current one.
      textLayer.textContent = '';
      // Align the text layer with the canvas
      textLayer.style.left = canvas.offsetLeft + 'px';
      textLayer.style.top = canvas.offsetTop + 'px';
      textLayer.style.height = canvas.offsetHeight + 'px';
      textLayer.style.width = canvas.offsetWidth + 'px';
      // Pass the data to the method for rendering of text over the pdf canvas.
      var textLayerTask = pdfjsLib.renderTextLayer({
        textContent: textContent,
        container: textLayer,
        viewport: viewport,
        textDivs: []
      });
      // Keep the chain pending until the text layer is done (when the
      // returned task exposes a promise).
      return textLayerTask && textLayerTask.promise;
    });
  }).catch(function(reason) {
    // Report the failure but fall through to the cleanup below, so that a
    // single failed page does not leave pageRendering stuck at true.
    console.error('Failed to render page ' + num, reason);
  }).then(function() {
    pageRendering = false;
    if (pageNumPending !== null) {
      // New page rendering is pending; start it only now, so the finished
      // page's text layer can no longer land on top of the new page.
      var nextNum = pageNumPending;
      pageNumPending = null;
      renderPage(nextNum);
    }
  });
  // Update page counters
  document.getElementById('page_num').textContent = num;
}
/**
 * Renders the requested page right away when nothing is being rendered;
 * otherwise remembers it so it can be rendered once the current rendering
 * is finished.
 */
function queueRenderPage(num) {
  if (!pageRendering) {
    renderPage(num);
    return;
  }
  pageNumPending = num;
}
/**
 * Steps back one page and queues it for rendering; does nothing when the
 * first page is already shown.
 */
function onPrevPage() {
  var onFirstPage = pageNum <= 1;
  if (onFirstPage) {
    return;
  }
  pageNum -= 1;
  queueRenderPage(pageNum);
}
// Clicking the element with id "prev" invokes onPrevPage.
document.getElementById('prev').addEventListener('click', onPrevPage);
/**
 * Displays next page.
 *
 * Does nothing while the document has not been loaded yet (pdfDoc is still
 * null) or when the last page is already shown.
 */
function onNextPage() {
  // pdfDoc stays null until the download completes; a click before that
  // would otherwise throw on pdfDoc.numPages.
  if (!pdfDoc || pageNum >= pdfDoc.numPages) {
    return;
  }
  pageNum++;
  queueRenderPage(pageNum);
}
// Clicking the element with id "next" invokes onNextPage.
document.getElementById('next').addEventListener('click', onNextPage);
/**
 * Starts the asynchronous PDF download. When the document arrives it is
 * stored in pdfDoc, the page-count label is filled in, and the current page
 * is rendered for the first time.
 */
pdfjsLib.getDocument(url).promise.then(function(loadedDoc) {
  var pageCountLabel = document.getElementById('page_count');
  pdfDoc = loadedDoc;
  pageCountLabel.textContent = pdfDoc.numPages;
  // Initial/first page rendering
  renderPage(pageNum);
});
</script>
</body>
</html>

你的javascript代码是完美的。你只需要包含文本图层生成器所依赖的UI实用程序:

https://github.com/mozilla/pdf.js/blob/master/web/ui_utils.js

或者在HTML中:

<script src="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/ui_utils.js"></script>

如果你运行你的代码(没有ui_utils)并检查调试控制台,你会看到ReferenceError: CustomStyle is not defined。在PDFjs的repo中快速搜索一下,你会发现它是在ui_utils.js中定义的。

这是我的最小但完整的代码供您参考。我在这里使用PDFjs的演示pdf。注意,在生产环境中,你不应该链接到raw.github.

<!DOCTYPE html><meta charset="utf-8">
<link rel="stylesheet" href="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/text_layer_builder.css" />
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.1.4/jquery.min.js"></script>
<script src="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/ui_utils.js"></script>
<script src="https://raw.githubusercontent.com/mozilla/pdf.js/master/web/text_layer_builder.js"></script>
<script src="https://mozilla.github.io/pdf.js/build/pdf.js"></script>
<body>
  <div>
    <canvas id="the-canvas" style="border:1px solid black;"></canvas>
    <div id="text-layer" class="textLayer"></div>
  </div>
<script>
'use strict';
// Render page 1 of file.pdf into #the-canvas and lay a TextLayerBuilder text
// layer (#text-layer) over it.
PDFJS.getDocument('file.pdf').then(function (doc) {
    var firstPage = 1;
    doc.getPage(firstPage).then(function (pdfPage) {
        var zoomFactor = 1.5;
        var vp = pdfPage.getViewport(zoomFactor);

        // Size the canvas to the scaled page.
        var canvasNode = $('#the-canvas')[0];
        var ctx2d = canvasNode.getContext('2d');
        canvasNode.height = vp.height;
        canvasNode.width = vp.width;

        // Give the text layer the same size and document position as the canvas.
        var canvasPos = $(canvasNode).offset();
        var $layer = $('#text-layer');
        $layer.css({
            height : vp.height + 'px',
            width : vp.width + 'px',
            top : canvasPos.top,
            left : canvasPos.left
        });

        pdfPage.render({ canvasContext : ctx2d, viewport : vp });

        pdfPage.getTextContent().then(function (pageText) {
            console.log( pageText );
            var layerOptions = {
                textLayerDiv : $layer.get(0),
                pageIndex : firstPage - 1,
                viewport : vp
            };
            var layerBuilder = new TextLayerBuilder(layerOptions);
            layerBuilder.setTextContent(pageText);
            layerBuilder.render();
        });
    });
});
</script>

经过几个小时的努力,我发现这篇文章对于实现文本选择以及在不依赖Node.js的情况下使用pdf.js非常有帮助:《使用Mozilla的PDF.js在JavaScript中自定义PDF渲染》

您好,您已经在HTML内容中创建了画布。

画布将不支持文本选择,所以你需要改变画布为另一种方式