WIP: PDF Attachment Extractor

This commit is contained in:
Jonathan Jenne 2020-03-04 16:42:06 +01:00
parent b995304ee3
commit c3c6a50992
6 changed files with 126 additions and 12 deletions

View File

@ -32,6 +32,7 @@ Partial Class Form1
Me.Button5 = New System.Windows.Forms.Button()
Me.GroupBox1 = New System.Windows.Forms.GroupBox()
Me.GroupBox2 = New System.Windows.Forms.GroupBox()
Me.Button7 = New System.Windows.Forms.Button()
Me.GroupBox3 = New System.Windows.Forms.GroupBox()
Me.txtPropName = New System.Windows.Forms.TextBox()
Me.GroupBox4 = New System.Windows.Forms.GroupBox()
@ -60,7 +61,7 @@ Partial Class Form1
Me.ListBox1.FormattingEnabled = True
Me.ListBox1.Location = New System.Drawing.Point(378, 12)
Me.ListBox1.Name = "ListBox1"
Me.ListBox1.Size = New System.Drawing.Size(526, 407)
Me.ListBox1.Size = New System.Drawing.Size(526, 472)
Me.ListBox1.TabIndex = 1
'
'Button2
@ -122,19 +123,29 @@ Partial Class Form1
'GroupBox2
'
Me.GroupBox2.Controls.Add(Me.Button2)
Me.GroupBox2.Controls.Add(Me.Button7)
Me.GroupBox2.Controls.Add(Me.Button3)
Me.GroupBox2.Location = New System.Drawing.Point(12, 110)
Me.GroupBox2.Name = "GroupBox2"
Me.GroupBox2.Size = New System.Drawing.Size(360, 90)
Me.GroupBox2.Size = New System.Drawing.Size(360, 155)
Me.GroupBox2.TabIndex = 8
Me.GroupBox2.TabStop = False
Me.GroupBox2.Text = "Run Functions on a single file (needs Breakpoint)"
'
'Button7
'
Me.Button7.Location = New System.Drawing.Point(6, 77)
Me.Button7.Name = "Button7"
Me.Button7.Size = New System.Drawing.Size(221, 23)
Me.Button7.TabIndex = 3
Me.Button7.Text = "Extract PDF Attachments"
Me.Button7.UseVisualStyleBackColor = True
'
'GroupBox3
'
Me.GroupBox3.Controls.Add(Me.Button4)
Me.GroupBox3.Controls.Add(Me.txtMD5Checksum)
Me.GroupBox3.Location = New System.Drawing.Point(12, 206)
Me.GroupBox3.Location = New System.Drawing.Point(12, 271)
Me.GroupBox3.Name = "GroupBox3"
Me.GroupBox3.Size = New System.Drawing.Size(360, 85)
Me.GroupBox3.TabIndex = 9
@ -152,7 +163,7 @@ Partial Class Form1
'
Me.GroupBox4.Controls.Add(Me.Button6)
Me.GroupBox4.Controls.Add(Me.txtPropName)
Me.GroupBox4.Location = New System.Drawing.Point(12, 297)
Me.GroupBox4.Location = New System.Drawing.Point(12, 362)
Me.GroupBox4.Name = "GroupBox4"
Me.GroupBox4.Size = New System.Drawing.Size(360, 122)
Me.GroupBox4.TabIndex = 11
@ -172,7 +183,7 @@ Partial Class Form1
'
Me.AutoScaleDimensions = New System.Drawing.SizeF(6.0!, 13.0!)
Me.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font
Me.ClientSize = New System.Drawing.Size(916, 435)
Me.ClientSize = New System.Drawing.Size(916, 492)
Me.Controls.Add(Me.GroupBox4)
Me.Controls.Add(Me.GroupBox3)
Me.Controls.Add(Me.GroupBox2)
@ -204,4 +215,5 @@ Partial Class Form1
Friend WithEvents txtPropName As TextBox
Friend WithEvents GroupBox4 As GroupBox
Friend WithEvents Button6 As Button
Friend WithEvents Button7 As Button
End Class

View File

@ -85,6 +85,7 @@ Public Class Form1
Dim args As New WorkerArgs()
args = LoadFolderConfig(args)
args = LoadPropertyMapFor(args, "DEFAULT")
args.GDPictureKey = "21182889975216572111813147150675976632"
Dim job As New Jobs.ImportZUGFeRDFiles(_logConfig, _firebird)
@ -138,7 +139,7 @@ Public Class Form1
End Function
' Click handler for Button5: hands the UNC path of the ZUGFeRD "Email_in" test import
' folder to Process.Start.
' NOTE(review): two Process.Start calls are listed here (DD-STO01-A2 and DD-DFSR01). This
' looks like the before/after of a path change - confirm that only the intended share remains.
Private Sub Button5_Click(sender As Object, e As EventArgs) Handles Button5.Click
Process.Start("\\dd-sto01\DD-STO01-A2\SharedObjects\Public\Projekte\Test\Import\ZUGFerD\Email_in")
Process.Start("\\dd-sto01\DD-DFSR01\SharedObjects\Public\Projekte\Test\Import\ZUGFerD\Email_in")
End Sub
Private Sub Button6_Click(sender As Object, e As EventArgs) Handles Button6.Click
@ -160,4 +161,15 @@ Public Class Form1
End Try
End If
End Sub
''' <summary>
''' Click handler for Button7: asks the user for a file and runs the PDF attachment
''' extractor on it. The extraction result is not used here.
''' </summary>
Private Sub Button7_Click(sender As Object, e As EventArgs) Handles Button7.Click
    ' Extensions the extractor is allowed to write out (no leading dot).
    Dim oAllowedExtensions As New List(Of String) From {"docx", "doc", "pdf", "xls", "xlsx", "ppt", "pptx", "txt"}
    Dim oExtractor = New Jobs.PDFAttachments(_logConfig, "21182889975216572111813147150675976632")

    ' Nothing to do when the dialog is dismissed.
    If OpenFileDialog1.ShowDialog() <> DialogResult.OK Then
        Return
    End If

    oExtractor.Extract(OpenFileDialog1.FileName, AllowedExtensions:=oAllowedExtensions)
End Sub
End Class

View File

@ -42,6 +42,9 @@ Public Class ImportZUGFeRDFiles
</ul></p>
"""
' List of allowed extensions for PDF/A Attachments (without leading dot).
' Declared with an explicit type: a field without an As clause is typed Object, which would
' make every use of it (e.g. passing it to PDFAttachments.Extract) late-bound.
Private AllowedExtensions As New List(Of String) From {"docx", "doc", "pdf", "xls", "xlsx", "ppt", "pptx", "txt"}
Private _logger As Logger
Private _logConfig As LogConfig
Private _zugferd As ZUGFeRDInterface
@ -318,6 +321,7 @@ Public Class ImportZUGFeRDFiles
Public Sub Start(Arguments As Object) Implements IJob.Start
Dim oArgs As WorkerArgs = Arguments
Dim oPropertyExtractor = New PropertyValues(_logConfig)
Dim oAttachmentExtractor = New PDFAttachments(_logConfig, oArgs.GDPictureKey)
_logger.Debug("Starting Job {0}", [GetType].Name)
@ -404,10 +408,16 @@ Public Class ImportZUGFeRDFiles
_logger.Warn("Unexpected Error occurred while extracting ZUGFeRD Information from file {0}", oFile.FullName)
Throw ex
End Select
End Try
Dim oAttachments = oAttachmentExtractor.Extract(oFile.FullName, AllowedExtensions)
If oAttachments Is Nothing Then
_logger.Warn("Attachments for file [{0}] could not be extracted", oFile.FullName)
Else
oFileAttachmentFiles.AddRange(oFileGroupFiles)
oFileAttachmentFiles.AddRange(oAttachments)
End If
oMD5CheckSum = CreateMD5(oFile.FullName)
If oMD5CheckSum <> String.Empty Then
Dim oCheckCommand = $"SELECT * FROM TBEDM_ZUGFERD_HISTORY_IN WHERE GUID = (SELECT MAX(GUID) FROM TBEDM_ZUGFERD_HISTORY_IN WHERE UPPER(MD5HASH) = UPPER('{oMD5CheckSum}'))"

View File

@ -1,9 +1,84 @@
Public Class PDFAttachments
Public Sub New(GdPictureKey As String)
Imports System.Collections.Generic
Imports System.IO
Imports DigitalData.Modules.Logging
Imports GdPicture14
Public Class PDFAttachments
Private Logger As Logger
Private Const ZUGFERD_XML_FILENAME = "ZUGFeRD-invoice.xml"
''' <summary>
''' Creates the extractor, obtains its logger and registers the GdPicture license key.
''' </summary>
''' <param name="LogConfig">Logging configuration the logger is obtained from.</param>
''' <param name="GdPictureKey">GdPicture license key; registration is skipped when it is empty.</param>
Public Sub New(LogConfig As LogConfig, GdPictureKey As String)
    Logger = LogConfig.GetLogger

    ' The key was previously accepted but never used. Register it so the GdPicturePDF
    ' calls made by this class run with the license supplied by the caller.
    If Not String.IsNullOrEmpty(GdPictureKey) Then
        ' Fully qualified to avoid any clash with System.ComponentModel.LicenseManager.
        Dim oLicenseManager As New GdPicture14.LicenseManager()
        oLicenseManager.RegisterKEY(GdPictureKey)
    End If
End Sub
Public Shared Function Extract(FileName As String)
Using oGDPicturePDF As New GDPicturePDF
''' <summary>
''' Extracts the files embedded in a PDF into the temp directory. The ZUGFeRD invoice XML
''' and files whose extension is not in <paramref name="AllowedExtensions"/> are skipped.
''' </summary>
''' <param name="FileName">Path of the PDF file to inspect.</param>
''' <param name="AllowedExtensions">
''' Extensions (without leading dot, compared case-insensitively) that may be extracted.
''' Nothing is treated as an empty list, i.e. no file is extracted.
''' </param>
''' <returns>
''' The extracted files (possibly empty), or Nothing when the PDF cannot be loaded, the
''' embedded file count cannot be read, or an unexpected exception occurs.
''' </returns>
Public Function Extract(FileName As String, AllowedExtensions As List(Of String)) As List(Of FileInfo)
    Dim oResults As New List(Of FileInfo)

    ' Ordinal, case-insensitive lookup instead of culture-sensitive ToUpper comparisons.
    Dim oExtensions As New HashSet(Of String)(StringComparer.OrdinalIgnoreCase)
    If AllowedExtensions IsNot Nothing Then
        For Each oAllowed In AllowedExtensions
            If Not String.IsNullOrEmpty(oAllowed) Then
                oExtensions.Add(oAllowed)
            End If
        Next
    End If

    Try
        Using oGDPicturePDF As New GdPicturePDF()
            If oGDPicturePDF.LoadFromFile(FileName, False) <> GdPictureStatus.OK Then
                Logger.Error("The file [{0}] can't be loaded.", FileName)
                Return Nothing
            End If

            Dim oEmbeddedFileCount As Integer = oGDPicturePDF.GetEmbeddedFileCount()
            If oGDPicturePDF.GetStat() <> GdPictureStatus.OK Then
                Logger.Error("An error occurred getting the number of embedded files. Status: {0}", oGDPicturePDF.GetStat().ToString())
                Return Nothing
            End If

            ' NOTE(review): a PDF with exactly one embedded file is not inspected at all,
            ' presumably because that single file is expected to be the invoice XML - confirm.
            If oEmbeddedFileCount <= 1 Then
                Return oResults
            End If

            For oIndex As Integer = 0 To oEmbeddedFileCount - 1
                Dim oFileName As String = oGDPicturePDF.GetEmbeddedFileName(oIndex)
                If oGDPicturePDF.GetStat() <> GdPictureStatus.OK Then
                    Logger.Error("An error occurred getting the file name for [{0}]. Status: {1}", oFileName, oGDPicturePDF.GetStat().ToString())
                    Continue For
                End If

                ' The embedded name comes from the PDF and is untrusted: keep only the file
                ' name part so it cannot point outside the temp directory.
                Dim oSafeName As String = Path.GetFileName(oFileName)
                If String.IsNullOrEmpty(oSafeName) Then
                    Logger.Warn("Embedded file at index [{0}] was skipped because it has no usable file name.", oIndex)
                    Continue For
                End If

                If String.Equals(oFileName, ZUGFERD_XML_FILENAME, StringComparison.OrdinalIgnoreCase) Then
                    Logger.Debug("File [{0}] was skipped because its name indicates the invoice data file.", oFileName)
                    Continue For
                End If

                ' A name without extension yields an empty string here (previously Substring(1)
                ' threw and aborted the whole extraction); it then fails the allow-list check.
                Dim oExtension As String = Path.GetExtension(oSafeName).TrimStart("."c).ToUpperInvariant()
                If Not oExtensions.Contains(oExtension) Then
                    Logger.Warn("File [{0}] was skipped because its extension [{1}] is not allowed.", oFileName, oExtension)
                    Continue For
                End If

                Dim oFileSize As Integer = oGDPicturePDF.GetEmbeddedFileSize(oIndex)
                If oGDPicturePDF.GetStat() <> GdPictureStatus.OK Then
                    Logger.Error("An error occurred getting the file size for [{0}]. Status: {1}", oFileName, oGDPicturePDF.GetStat().ToString())
                    Continue For
                End If

                ' NOTE(review): New Byte(n) {} allocates n + 1 elements. Unless
                ' ExtractEmbeddedFile replaces the array, one extra zero byte is written - confirm.
                Dim oFileData As Byte() = New Byte(oFileSize) {}
                Dim oStatus As GdPictureStatus = oGDPicturePDF.ExtractEmbeddedFile(oIndex, oFileData)
                If oStatus <> GdPictureStatus.OK Then
                    Logger.Error("The embedded file [{0}] has failed to extract. Status: {1}", oFileName, oGDPicturePDF.GetStat().ToString())
                    Continue For
                End If

                Dim oTempName As String = Path.Combine(Path.GetTempPath(), oSafeName)

                ' FileMode.Create truncates an existing file; OpenOrCreate would leave stale
                ' bytes behind when a longer file of the same name already exists.
                Using oFileStream As New FileStream(oTempName, FileMode.Create)
                    oFileStream.Write(oFileData, 0, oFileData.Length)
                End Using

                oResults.Add(New FileInfo(oTempName))
            Next
        End Using

        Return oResults
    Catch ex As Exception
        Logger.Warn("Unexpected Error while Extracting attachments from File [{0}]", FileName)
        Logger.Error(ex)

        Return Nothing
    End Try
End Function
End Class

View File

@ -9,6 +9,7 @@ Public Class WorkerArgs
Public AttachmentsSubDirectory As String
Public PropertyMap As Dictionary(Of String, XmlItemProperty)
Public InsertIntoSQLServer As Boolean
Public GDPictureKey As String
Public Sub New()
WatchDirectories = New List(Of String)
@ -19,5 +20,6 @@ Public Class WorkerArgs
AttachmentsSubDirectory = Nothing
PropertyMap = New Dictionary(Of String, XmlItemProperty)
InsertIntoSQLServer = False
GDPictureKey = String.Empty
End Sub
End Class

View File

@ -109,6 +109,9 @@
<Reference Include="FirebirdSql.Data.FirebirdClient, Version=6.4.0.0, Culture=neutral, PublicKeyToken=3750abcc3150b00c, processorArchitecture=MSIL">
<HintPath>..\packages\FirebirdSql.Data.FirebirdClient.6.4.0\lib\net452\FirebirdSql.Data.FirebirdClient.dll</HintPath>
</Reference>
<Reference Include="GdPicture.NET.14">
<HintPath>D:\ProgramFiles\GdPicture.NET 14\Redist\GdPicture.NET (.NET Framework 4.5)\GdPicture.NET.14.dll</HintPath>
</Reference>
<Reference Include="Microsoft.CSharp" />
<Reference Include="NLog, Version=4.0.0.0, Culture=neutral, PublicKeyToken=5120e14c03d0593c, processorArchitecture=MSIL">
<HintPath>..\packages\NLog.4.6.8\lib\net45\NLog.dll</HintPath>