// Sample: find tables in a tagged PDF and read their data using structure tags.
using System;
using System.IO;
using System.Drawing;
using System.Linq;
using System.Collections.Generic;
using GrapeCity.Documents.Pdf;
using GrapeCity.Documents.Text;
using GrapeCity.Documents.Pdf.TextMap;
using GrapeCity.Documents.Pdf.Structure;
using GrapeCity.Documents.Pdf.Recognition.Structure;
using GCTEXT = GrapeCity.Documents.Text;
using GCDRAW = GrapeCity.Documents.Drawing;
namespace DsPdfWeb.Demos
{
/// <summary>
/// Finds tables in a source PDF using its structure tags, reads their cell data,
/// and prints that data into a new PDF. Each page of extracted table data is
/// followed by a rendering of the source page the tables were found on.
/// </summary>
public class ReadTagsTableData
{
    // Text formats: regular (cell data), table header, and page header.
    // They are initialized in CreatePDF before any other method runs.
    private TextFormat _tf, _tfHdr, _tfPgHdr;

    // Margin, in the page's units, used on all sides of the generated pages.
    private readonly float _margin = 72;

    /// <summary>
    /// Loads the source PDF, prints the data of all tables found via structure tags
    /// into a new document, and saves that document.
    /// </summary>
    /// <param name="stream">The stream the resulting PDF is saved to.</param>
    /// <returns>The number of pages in the resulting PDF.</returns>
    public int CreatePDF(Stream stream)
    {
        // Set up some text formats:
        _tf = new TextFormat()
        {
            Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeui.ttf")),
            FontSize = 9,
            ForeColor = Color.Black
        };
        _tfHdr = new TextFormat(_tf)
        {
            Font = GCTEXT.Font.FromFile(Path.Combine("Resources", "Fonts", "segoeuib.ttf")),
            FontSize = 11,
            ForeColor = Color.DarkBlue
        };
        _tfPgHdr = new TextFormat(_tf)
        {
            FontSize = 11,
            ForeColor = Color.Gray
        };

        // The resulting PDF:
        var doc = new GcPdfDocument();
        using (var s = File.OpenRead(Path.Combine("Resources", "PDFs", "C1Olap-QuickStart.pdf")))
        {
            var source = new GcPdfDocument();
            source.Load(s);
            PrintAllTables(doc, source);

            // Save while the source stream is still open: pages of the loaded document
            // are drawn into the result, and the loaded document may read from its
            // stream on demand. NOTE(review): confirm against the GcPdfDocument.Load docs.
            doc.Save(stream);
            return doc.Pages.Count;
        }
    }

    /// <summary>
    /// Finds all 'Table' elements directly under the root structure element of
    /// <paramref name="source"/> and, for each source page that has tables, adds two
    /// pages to <paramref name="doc"/>: one with the extracted table data and one
    /// with a rendering of the source page.
    /// </summary>
    /// <param name="doc">The document that receives the output pages.</param>
    /// <param name="source">The loaded PDF whose structure tags are inspected.</param>
    private void PrintAllTables(GcPdfDocument doc, GcPdfDocument source)
    {
        // Get the LogicalStructure and top parent element:
        LogicalStructure ls = source.GetLogicalStructure();
        if (ls == null || ls.Elements == null || ls.Elements.Count == 0)
        {
            // No structure tags found:
            Common.Util.AddNote("No structure tags were found in the source document.", doc.Pages.Add());
            return;
        }

        // The root element:
        Element root = ls.Elements[0];

        // Find and print all tables among the root's direct children:
        var tables = new List<(TextLayout Layout, Page SourcePage)>();
        if (root.HasChildren)
        {
            foreach (Element child in root.Children)
            {
                if (child.StructElement.Type == "Table")
                    tables.Add(PrintTable(child));
            }
        }

        if (tables.Count == 0)
        {
            // Without this note the resulting document would contain no pages at all:
            Common.Util.AddNote("No tables were found in the structure tags of the source document.", doc.Pages.Add());
            return;
        }

        // For each source page (tables are grouped by page index), print all tables
        // found on that page, followed by the original page for reference:
        foreach (var pageTables in tables.GroupBy(t_ => t_.SourcePage.Index))
        {
            Page srcPage = pageTables.First().SourcePage;

            // The page that will contain the extracted table data:
            var pgTables = doc.NewPage();
            // The page that will contain the source page for reference:
            var pgSrc = doc.NewPage();

            // Print the original page:
            srcPage.Draw(pgSrc.Graphics, pgSrc.Bounds);
            // Add a page header in the top margin area:
            pgSrc.Graphics.DrawString($"Page {srcPage.Index + 1} of the source PDF",
                _tfPgHdr, new RectangleF(0, 0, pgSrc.Size.Width, _margin), TextAlignment.Center, ParagraphAlignment.Center, false);

            float maxWidth = pgTables.Size.Width - _margin * 2;
            float maxHeight = pgTables.Size.Height - _margin * 2;
            float y = _margin;
            // Print all table data. For simplicity's sake we assume that all table data will fit on a single page:
            foreach (var t in pageTables)
            {
                t.Layout.MaxHeight = maxHeight;
                t.Layout.MaxWidth = maxWidth;
                pgTables.Graphics.DrawTextLayout(t.Layout, new PointF(_margin, y));
                // Each following table starts one margin below the previous one:
                float used = t.Layout.ContentHeight + _margin;
                maxHeight -= used;
                y += used;
            }
        }
    }

    /// <summary>
    /// Reads the rows and cells of a 'Table' element and builds a text layout with a
    /// header line followed by one line per row, cells separated by tabs.
    /// </summary>
    /// <param name="e">The element to read; its structure element type must be 'Table'.</param>
    /// <returns>The text layout with the table data, and the source page found for the table.</returns>
    /// <exception cref="ArgumentException">The element is not a 'Table'.</exception>
    /// <exception cref="InvalidOperationException">No page could be found for the table.</exception>
    private (TextLayout Layout, Page SourcePage) PrintTable(Element e)
    {
        // Same type check as the caller's filter and as the TR/TD checks below:
        if (e.StructElement.Type != "Table")
            throw new ArgumentException($"Unexpected: element type must be 'Table' but it is '{e.StructElement.Type}'.", nameof(e));

        // Rows of cells; each cell is the list of paragraphs it contains:
        var table = new List<List<IList<ITextParagraph>>>();
        int maxCols = 0;

        // Select all child elements with type TR - table rows. Elements of other types
        // (that have children) are searched recursively for nested rows:
        void SelectRows(IReadOnlyList<Element> elements)
        {
            foreach (Element ec in elements)
            {
                if (!ec.HasChildren)
                    continue;
                if (ec.StructElement.Type != "TR")
                {
                    SelectRows(ec.Children);
                    continue;
                }
                // Only the row's direct TD children are treated as cells:
                var rowCells = new List<IList<ITextParagraph>>();
                foreach (Element cell in ec.Children)
                {
                    if (cell.StructElement.Type == "TD")
                        rowCells.Add(cell.GetParagraphs());
                }
                maxCols = Math.Max(maxCols, rowCells.Count);
                table.Add(rowCells);
            }
        }
        SelectRows(e.Children);

        // Find the page the table is on:
        var sourcePage = FindPage(e.StructElement);
        if (sourcePage == null)
            throw new InvalidOperationException("Unexpected: could not find the default page for the table.");

        var tl = new TextLayout(72);
        // Add table data to the text layout:
        tl.Append($"\nTable on page {sourcePage.Index + 1} of the source document has {maxCols} column(s) and {table.Count} row(s).\nData by row:", _tfHdr);
        tl.AppendParagraphBreak();
        foreach (var row in table)
        {
            for (int icol = 0; icol < row.Count; ++icol)
            {
                // Cell text uses the regular format set up in CreatePDF:
                foreach (var para in row[icol])
                    tl.Append(para.GetText(), _tf);
                // Tabs go between cells only; the row itself is ended below:
                if (icol < row.Count - 1)
                    tl.Append("\t", _tf);
            }
            tl.AppendLine();
        }
        return (tl, sourcePage);
    }

    /// <summary>
    /// Returns the default page of a structure element, or of the first descendant
    /// (searched depth-first) that has one.
    /// </summary>
    /// <param name="se">The structure element to start from.</param>
    /// <returns>The page found, or null if neither the element nor any descendant has a default page.</returns>
    private Page FindPage(StructElement se)
    {
        if (se.DefaultPage != null)
            return se.DefaultPage;
        if (!se.HasChildren)
            return null;
        foreach (var child in se.Children)
        {
            var p = FindPage(child);
            if (p != null)
                return p;
        }
        return null;
    }
}
}