8
0
Skriptentwickung/archive/Get-PDFContentAndWriteToDB/Get-PDFContentAndWriteToDB.ps1
2024-01-24 16:42:38 +01:00

219 lines
6.3 KiB
PowerShell
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#PowerShell 4.0 Script
#Digital Data
#Ludwig-Rinn-Strasse 16
#35452 Heuchelheim
#Tel.: 0641 / 202360
#E-Mail: info@digitaldata.works
#Version Number 1.0.0.0
#Version Date 20.11.2020
#Requires Version 4.0
#-----------------------------------------------------------------------------------------------------#
######################################## check for arguments ##########################################
#-----------------------------------------------------------------------------------------------------#
#-----------------------------------------------------------------------------------------------------#
################################# add additional built-in assemblies ##################################
#-----------------------------------------------------------------------------------------------------#
#-----------------------------------------------------------------------------------------------------#
############################################ set variables ############################################
#-----------------------------------------------------------------------------------------------------#
#--- PDF text extraction ---
#Assembly loaded with Add-Type inside the extraction job
[string]$PDFExtractorDLL = 'E:\itextsharp\itextsharp.dll'

#--- File system locations ---
#Folder that is scanned for input files
[string]$SourcePath = 'E:\itextsharp\in'
#Files are moved here after a successful database insert
[string]$ArchivPath = 'E:\itextsharp\out'
#Files are moved here when the database insert fails
[string]$ErrorPath = 'E:\itextsharp\error'
#Filter handed to Get-ChildItem when collecting the input files
[string]$SourceFileExtension = '*.pdf'

#--- SQL Server connection ---
#NOTE(review): server, login and password are hard-coded in clear text; consider
#moving them out of the script - TODO confirm with the owner of this job
[string]$DBSQLConnectServer = '172.24.12.41\tests'
[string]$DBSQLConnectDatabase = 'DD_Test'
[string]$DBSQLConnectUser = 'sa'
[string]$DBSQLConnectPassword = 'dd'
#The [string] cast turns $false into the text "False", which is what ends up in the connection string
[string]$DBSQLConnectIntegratedSecurity = $false
#First half of the INSERT statement; the VALUES part is appended in the main part
[string]$DBSQLQueryHead = 'INSERT INTO [itextsharp] (FILENAME, FILECONTENT)'

#--- Runtime state ---
#Running number of the file currently being processed
[int]$Counter = 0
#-----------------------------------------------------------------------------------------------------#
########################################### preparing part ############################################
#-----------------------------------------------------------------------------------------------------#
#Clear console content
Clear-Host
#Collect the files to process. -File (available since PowerShell 3.0) keeps
#directories whose name happens to match the filter out of the work list,
#so only real files are handed to the PDF reader.
[Array]$Items = Get-ChildItem -Path $SourcePath -Filter $SourceFileExtension -File
#-----------------------------------------------------------------------------------------------------#
############################################# main part ###############################################
#-----------------------------------------------------------------------------------------------------#
#Main part: for every file in $Items
#  1. extract the text of page 1 in a background job,
#  2. insert file name and text into the database (parameterized INSERT),
#  3. move the file to $ArchivPath on success or to $ErrorPath when the insert fails.
#Files whose extracted text is 20 characters or shorter are reported and left in place.
IF ($Items.Count -ge 1) {
    Write-Host "Found" $Items.Count $SourceFileExtension.Replace("*.","")"Files!"

    TRY {
        $DBSQLConnection = New-Object System.Data.SqlClient.SqlConnection
        $DBSQLConnection.ConnectionString = "Server = $DBSQLConnectServer; uid=$DBSQLConnectUser; pwd=$DBSQLConnectPassword; Database = $DBSQLConnectDatabase; Integrated Security = $DBSQLConnectIntegratedSecurity"
    } #end try
    CATCH {
        Write-Host "Cannot prepare db connect!"
        Write-Host $DBSQLConnectServer
        Write-Host $DBSQLConnectDatabase
        Write-Host $DBSQLConnectUser
        #The password is deliberately not echoed: console output may end up in logs.
        Write-Host $DBSQLConnectIntegratedSecurity
        Write-Host $DBSQLQueryHead
        Write-Error $_
        Remove-Variable * -ErrorAction SilentlyContinue
        $Error.Clear()
        #Non-zero exit code so a calling scheduler can detect the failure
        EXIT 1
    } #end catch

    FOREACH ($Item in $Items) {
        $Counter++
        Write-Host ""
        Write-Host "================================="
        Write-Host "---------------------------------"
        Write-Host "Item" $Counter "of" $Items.count
        Write-Host "Processing File:" $Item

        #Reset per file: otherwise a failed extraction would reuse the text of
        #the previous file and insert it under the current file name.
        $PDFExtract = $NULL
        $PDFExtractJob = $NULL

        TRY {
            #The extraction runs in a separate job; the DLL and the file are only
            #opened inside that job, not in this session.
            $PDFExtractJob = Start-Job -ScriptBlock {
                $PDFExtractorDLL = $args[0]
                $ItemFullName = $args[1]
                Add-Type -Path $PDFExtractorDLL

                #Open the reader explicitly so it can be closed again
                $PDFReader = New-Object iTextSharp.text.pdf.PdfReader -ArgumentList $ItemFullName
                TRY {
                    $PDFExtract = [iTextSharp.text.pdf.parser.PdfTextExtractor]::GetTextFromPage($PDFReader, 1)
                } #end try
                FINALLY {
                    $PDFReader.Close()
                } #end finally

                IF ($NULL -eq $PDFExtract) {
                    $PDFExtract = ""
                } #end if

                $PDFExtract = $PDFExtract.Trim()
                $PDFExtract = $PDFExtract.Replace("'","")
                $PDFExtract = $PDFExtract.Replace('"','')
                #NOTE(review): two Replace() calls stood here whose search strings are
                #invisible/empty in the reviewed copy of this file; with an empty search
                #string String.Replace throws. They presumably removed non-printable
                #symbols - this strips private-use, format and non-whitespace control
                #characters instead. TODO confirm against the original characters.
                $PDFExtract = [regex]::Replace($PDFExtract, '[\p{Co}\p{Cf}\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '')
                $PDFExtract = $PDFExtract.Replace("·","")
                Write-Output $PDFExtract
            } -ArgumentList $PDFExtractorDLL, $($Item.FullName)

            #-ErrorAction Stop turns errors reported by the job into terminating
            #errors, so the CATCH below is actually reached.
            $PDFExtract = Receive-Job -Job $PDFExtractJob -Wait -ErrorAction Stop
        } #end try
        CATCH {
            Write-Host "Cannot read file fulltext!"
            Write-Error $_
            $PDFExtract = $NULL
            $Error.Clear()
        } #end catch
        FINALLY {
            #Remove the finished job so job objects do not pile up per file
            IF ($PDFExtractJob) {
                Remove-Job -Job $PDFExtractJob -Force -ErrorAction SilentlyContinue
            } #end if
        } #end finally

        IF ($PDFExtract.Length -gt 20) {
            Write-Host "File has a valid Fulltext!"
            $DBSQLCommand = $NULL

            TRY {
                #Values are passed as parameters instead of being pasted into the SQL
                #text, so quotes in file names or PDF text cannot break or alter the statement.
                $DBSQLQuery = $DBSQLQueryHead + " VALUES (@FileName, @FileContent);"
                Write-Host "Executing SQL Query..."
                $DBSQLCommand = New-Object System.Data.SqlClient.SqlCommand
                $DBSQLCommand.Connection = $DBSQLConnection
                $DBSQLCommand.CommandText = $DBSQLQuery
                $DBSQLCommand.Parameters.AddWithValue("@FileName", $Item.BaseName) | Out-Null
                $DBSQLCommand.Parameters.AddWithValue("@FileContent", "$PDFExtract") | Out-Null
                $DBSQLConnection.Open()
                $DBSQLCommand.ExecuteNonQuery() | Out-Null
                $DBSQLConnection.Close()
                Move-Item -Path $($Item.FullName) -Destination $ArchivPath -Force
                Write-Host "... done!"
            } #end try
            CATCH {
                Move-Item -Path $($Item.FullName) -Destination $ErrorPath -Force
                Write-Host "Cannot insert to db!"
                Write-Host "Moving file to error path."
                #$_ is the error that triggered this CATCH; $Error[0] could already be
                #an error raised by the Move-Item above.
                Write-Error $_
                $Error.Clear()
            } #end catch
            FINALLY {
                #Close() on an already closed connection does nothing
                $DBSQLConnection.Close()
                IF ($DBSQLCommand) {
                    $DBSQLCommand.Dispose()
                } #end if
            } #end finally

            Write-Host "---------------------------------"
            Write-Host "================================="
        } #end if
        ELSE {
            Write-Host "File has no or an invalid Fulltext!"
        } #end else
    } #end foreach
} #end if
ELSE {
    Write-Host "No files to process!"
} #end else

#Final cleanup: drop the script variables and empty the error list
Remove-Variable * -ErrorAction SilentlyContinue
$Error.Clear()
#Sources:
#https://vasol.eu/working-with-pdfs-with-powershell-in-run-net-script-activites/
#Create SQL TB
#SET ANSI_NULLS ON
#GO
#
#SET QUOTED_IDENTIFIER ON
#GO
#
#CREATE TABLE [dbo].[itextsharp](
# [GUID] [bigint] IDENTITY(1,1) NOT NULL,
# [FILENAME] [varchar](50) NOT NULL,
# [FILECONTENT] [varchar](max) NOT NULL,
# [STATE] [int] NULL,
# [ADDED_WHEN] [datetime] NULL
#) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY]
#GO
#
#ALTER TABLE [dbo].[itextsharp] ADD CONSTRAINT [DF_itextsharp_ADDED_WHEN] DEFAULT (getdate()) FOR [ADDED_WHEN]
#GO