'' ExtractParagraphs.vb
'' 完毕:
Imports System.IO
Imports System.Drawing
Imports System.Numerics
Imports System.Collections.Generic
Imports System.Linq
Imports GrapeCity.Documents.Text
Imports GrapeCity.Documents.Drawing
Imports GrapeCity.Documents.Pdf
Imports GrapeCity.Documents.Pdf.Annotations
Imports GrapeCity.Documents.Pdf.Graphics
Imports GCTEXT = GrapeCity.Documents.Text
Imports GCDRAW = GrapeCity.Documents.Drawing

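'' This sample shows how to extract text paragraphs from an existing PDF.
'' It loads a PDF (Wetlands.pdf) into a temporary GcPdfDocument, then
'' retrieves the paragraphs of each page via Page.GetTextMap().Paragraphs,
'' appends them all to a TextLayout and renders that layout into the current document.
'' The original PDF is then merged into the generated document for reference.
'' Alternatives for plain text are Page.GetText() (one page) and
'' GcPdfDocument.GetText() (the whole document at once).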
''' <summary>
''' Sample that extracts the paragraphs of an existing PDF and renders them,
''' with alternating background colors, into a newly generated PDF.
''' </summary>
Public Class ExtractParagraphs
    ''' <summary>
    ''' Builds the sample document and saves it to <paramref name="stream"/>.
    ''' </summary>
    ''' <param name="stream">Destination stream the generated PDF is saved to.</param>
    ''' <returns>The number of pages in the generated document at the time it is saved.</returns>
    Function CreatePDF(ByVal stream As Stream) As Integer
        Const margin = 36
        Dim c1 = Color.PaleGreen
        Dim c2 = Color.PaleGoldenrod

        Dim doc = New GcPdfDocument()
        Dim page = doc.NewPage()

        '' Explanatory note at the top of the first page. The text describes what
        '' the code below does: iterate pages, print paragraphs with alternating
        '' backgrounds, and append the original PDF.
        Dim rc = Util.AddNote(
            "这里我们将现有的 PDF (Wetlands) 加载到临时 GcPdfDocument 中," +
            "然后遍历该文档的页面,打印在每个页面上找到的所有段落。" +
            "段落的背景颜色交替变化,以便更清楚地显示段落之间的边界。" +
            "原始 PDF 附加在生成的文档之后以供参考。",
            page,
            New RectangleF(margin, margin, page.Size.Width - margin * 2, 0))

        '' Text format for the per-page captions:
        Dim tf = New TextFormat() With
            {
                .Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "yumin.ttf")),
                .FontSize = 14,
                .ForeColor = Color.Blue
            }
        '' Text format for the paragraphs (the first paragraph uses background c1):
        Dim tfpar = New TextFormat() With
        {
            .Font = StandardFonts.Times,
            .FontSize = 12,
            .BackColor = c1
        }
        '' Text layout used to render the extracted text:
        Dim tl = page.Graphics.CreateTextLayout()
        tl.MaxWidth = doc.PageSize.Width
        tl.MaxHeight = doc.PageSize.Height
        tl.MarginAll = rc.Left
        '' On the first page the text starts one margin below the note:
        tl.MarginTop = rc.Bottom + margin
        '' Text split options for widow/orphan control; continuation pages
        '' use the regular margin at the top:
        Dim topt = New TextSplitOptions(tl) With
        {
            .MinLinesInFirstParagraph = 2,
            .MinLinesInLastParagraph = 2,
            .RestMarginTop = rc.Left
        }

        '' Open the source PDF, load it into a temporary document and collect
        '' the paragraphs of every page.
        '' NOTE(review): Save is kept inside the Using block, as in the original -
        '' presumably doc1 still reads from fs while it is being merged/saved; confirm.
        Using fs = File.OpenRead(Path.Combine("Resources", "PDFs", "Wetlands.pdf"))
            Dim doc1 = New GcPdfDocument()
            doc1.Load(fs)

            For i = 0 To doc1.Pages.Count - 1
                tl.AppendLine(String.Format("原始 PDF 第 {0} 页的段落:", i + 1), tf)

                Dim pg = doc1.Pages(i)
                Dim pars = pg.GetTextMap().Paragraphs
                For Each par In pars
                    tl.AppendLine(par.GetText(), tfpar)
                    '' Continue with a fresh copy carrying the other background color
                    '' instead of mutating the instance just handed to the layout.
                    '' NOTE(review): if TextLayout keeps a reference to the format it is
                    '' given, mutating it would recolor paragraphs already appended;
                    '' copying is correct in either case.
                    tfpar = New TextFormat(tfpar) With {.BackColor = If(tfpar.BackColor = c1, c2, c1)}
                Next
            Next

            tl.PerformLayout(True)
            While True
                '' 'rest' receives the text that did not fit on the current page:
                Dim rest As TextLayout = Nothing
                Dim splitResult = tl.Split(topt, rest)
                doc.Pages.Last.Graphics.DrawTextLayout(tl, PointF.Empty)
                If splitResult <> SplitResult.Split Then
                    Exit While
                End If
                '' Continue with the remainder on a new page:
                tl = rest
                doc.NewPage()
            End While
            '' Append the original document for reference:
            doc.MergeWithDocument(doc1, New MergeDocumentOptions())

            '' Done:
            doc.Save(stream)
            Return doc.Pages.Count
        End Using
    End Function
End Class