ファイル判別、情報取得機能追加

This commit is contained in:
ou-yongtong 2025-10-28 09:29:53 +09:00
parent 022f6d61a1
commit abd9fb7753
7 changed files with 1562 additions and 16 deletions

View File

@ -0,0 +1,44 @@
using System.Runtime.InteropServices;
using System.Runtime.Versioning;
namespace InfoExtraction
{
/// <summary>
/// Wraps a COM object so that its runtime callable wrapper (RCW) reference is
/// released when the wrapper is disposed (typically through a <c>using</c> statement).
/// </summary>
/// <typeparam name="T">Type of the wrapped object (normally a COM interop interface).</typeparam>
[SupportedOSPlatform("windows")]
class ComWrapper<T> : IDisposable
{
    /// <summary>The wrapped object, exactly as it was passed to the constructor.</summary>
    public T ComObject { get; }

    /// <summary>Creates a wrapper around <paramref name="comObject"/>.</summary>
    /// <param name="comObject">Object to release on dispose; may be null.</param>
    public ComWrapper(T comObject)
    {
        this.ComObject = comObject;
    }

    // True once the release has been performed, so repeated Dispose calls are no-ops.
    private bool disposedValue = false;

    /// <summary>Releases the wrapped COM object once.</summary>
    /// <param name="disposing">
    /// True when called from <see cref="Dispose()"/>, false when called from the finalizer.
    /// There is no managed state to clean up, so both paths do the same work.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if (disposedValue)
        {
            return;
        }

        // Marshal.ReleaseComObject throws ArgumentException for anything that is not
        // an RCW, so only call it for genuine COM objects.
        if (ComObject != null && Marshal.IsComObject(ComObject))
        {
            Marshal.ReleaseComObject(ComObject);
        }

        disposedValue = true;
    }

    // NOTE(review): releasing an RCW from a finalizer relies on the RCW still being
    // usable at finalization time - confirm whether this finalizer is really needed.
    ~ComWrapper()
    {
        Dispose(false);
    }

    /// <summary>Releases the wrapped COM object and suppresses finalization.</summary>
    public void Dispose()
    {
        Dispose(true);
        // The work is done; keep the object off the finalizer queue.
        GC.SuppressFinalize(this);
    }
}
}

View File

@ -2,14 +2,58 @@
<PropertyGroup> <PropertyGroup>
<OutputType>Exe</OutputType> <OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework> <TargetFramework>net8.0-windows10.0.22000.0</TargetFramework>
<RootNamespace>FY2526_SW_PoC_APIRelay</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings> <ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable> <Nullable>enable</Nullable>
</PropertyGroup> </PropertyGroup>
<ItemGroup>
<COMReference Include="Microsoft.Office.Core">
<WrapperTool>tlbimp</WrapperTool>
<VersionMinor>8</VersionMinor>
<VersionMajor>2</VersionMajor>
<Guid>2df8d04c-5bfa-101b-bde5-00aa0044de52</Guid>
<Lcid>0</Lcid>
<Isolated>false</Isolated>
<EmbedInteropTypes>true</EmbedInteropTypes>
</COMReference>
<COMReference Include="Microsoft.Office.Interop.Excel">
<WrapperTool>tlbimp</WrapperTool>
<VersionMinor>9</VersionMinor>
<VersionMajor>1</VersionMajor>
<Guid>00020813-0000-0000-c000-000000000046</Guid>
<Lcid>0</Lcid>
<Isolated>false</Isolated>
<EmbedInteropTypes>true</EmbedInteropTypes>
</COMReference>
<COMReference Include="Microsoft.Office.Interop.Word">
<WrapperTool>tlbimp</WrapperTool>
<VersionMinor>7</VersionMinor>
<VersionMajor>8</VersionMajor>
<Guid>00020905-0000-0000-c000-000000000046</Guid>
<Lcid>0</Lcid>
<Isolated>false</Isolated>
<EmbedInteropTypes>true</EmbedInteropTypes>
</COMReference>
<COMReference Include="Microsoft.Office.Interop.PowerPoint">
<WrapperTool>tlbimp</WrapperTool>
<VersionMinor>12</VersionMinor>
<VersionMajor>2</VersionMajor>
<Guid>91493440-5a91-11cf-8700-00aa0060263b</Guid>
<Lcid>0</Lcid>
<Isolated>false</Isolated>
<EmbedInteropTypes>true</EmbedInteropTypes>
</COMReference>
</ItemGroup>
<ItemGroup> <ItemGroup>
<PackageReference Include="log4net" Version="3.2.0" /> <PackageReference Include="log4net" Version="3.2.0" />
<PackageReference Include="Magick.NET-Q16-AnyCPU" Version="14.9.0" />
<PackageReference Include="Magick.NET.Core" Version="14.9.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.4" />
<PackageReference Include="PdfiumViewer" Version="2.13.0" />
<PackageReference Include="PdfiumViewer.Native.x86_64.v8-xfa" Version="2018.4.8.256" />
<PackageReference Include="System.Drawing.Common" Version="9.0.10" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>

View File

@ -0,0 +1,65 @@
using Microsoft.VisualBasic.FileIO;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.Text;
using System.Text.Json.Nodes;
using System.Threading.Tasks;
using static FY2526_SW_PoC_APIRelay.LogWriter;
namespace FY2526_SW_PoC_APIRelay
{
/// <summary>
/// Classifies input files by extension and checks whether a file path is already
/// stored in the Qdrant collection addressed by <c>qdrantUrl</c>.
/// </summary>
internal class FileDiscrimination
{
    // Single shared client for the whole process; requests time out after 30 seconds.
    private static readonly HttpClient httpClient = new HttpClient
    {
        Timeout = TimeSpan.FromSeconds(30)
    };

    // Base URL of the "points" resource of the collection that is queried.
    private static readonly string qdrantUrl = "http://localhost:6333/collections/personalInformation/points";

    /// <summary>Maps a file path to a <see cref="FileType"/> using only its extension.</summary>
    /// <param name="filePath">Path or file name to classify.</param>
    /// <returns>
    /// <see cref="FileType.Image"/> or <see cref="FileType.Document"/> for the known
    /// extensions, otherwise <see cref="FileType.Other"/>.
    /// </returns>
    public static FileType GetFileType(string filePath)
    {
        // Invariant lower-casing: a culture-sensitive ToLower() would turn ".GIF" into
        // ".gıf" under a Turkish culture and the extension would no longer match.
        string extension = Path.GetExtension(filePath).ToLowerInvariant();
        return extension switch
        {
            ".jpeg" or ".jpg" or ".png" or ".gif" or ".heic" => FileType.Image,
            ".txt" or ".pdf" or ".doc" or ".docx" or ".xls" or ".xlsx" or ".ppt" or ".pptx" => FileType.Document,
            _ => FileType.Other,
        };
    }

    /// <summary>
    /// Asks the database whether at least one point whose <c>filePath</c> field equals
    /// <paramref name="filePath"/> exists.
    /// </summary>
    /// <param name="filePath">File path to look up.</param>
    /// <returns>True when the response's <c>result.points</c> array is non-empty.</returns>
    /// <exception cref="Exception">
    /// The request failed or the response could not be interpreted; the original
    /// failure is available as <see cref="Exception.InnerException"/>.
    /// </exception>
    public static async Task<bool> FileExistCheck(string filePath)
    {
        var requestData = new
        {
            filter = new
            {
                must = new[] { new { key = "filePath", match = new { value = filePath } } }
            },
            limit = 1
        };
        var json = JsonConvert.SerializeObject(requestData);
        using var httpContent = new StringContent(json, Encoding.UTF8, "application/json");
        try
        {
            using var response = await httpClient.PostAsync($"{qdrantUrl}/scroll", httpContent);
            response.EnsureSuccessStatusCode();
            string responseString = await response.Content.ReadAsStringAsync();
            var jsonObject = JsonNode.Parse(responseString);
            return jsonObject?["result"]?["points"]?.AsArray()?.Count > 0;
        }
        catch (Exception ex)
        {
            LogWriter.WriteLog($"ファイル確認エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
            // Keep the original failure attached so callers can still diagnose the cause.
            throw new Exception("データベースにデータ取得エラー。", ex);
        }
    }
}
}

View File

@ -0,0 +1,12 @@
namespace FY2526_SW_PoC_APIRelay
{
/// <summary>Coarse category assigned to an input file.</summary>
enum FileType
{
    /// <summary>Image file.</summary>
    Image = 0,

    /// <summary>Document file.</summary>
    Document = 1,

    /// <summary>Anything else.</summary>
    Other = 2,
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,123 @@
using FY2526_SW_PoC_APIRelay;
using ImageMagick;
using PretreatmentFile;
using System;
using System.IO;
using System.Threading.Tasks;
using Windows.Graphics.Imaging;
using Windows.Media.Ocr;
using Windows.Storage;
using Windows.Storage.Streams;
namespace PretreatmentFile
{
/// <summary>
/// Text recognition for image files using the Windows OCR engine. HEIC images are
/// converted to a temporary JPEG with Magick.NET first.
/// </summary>
public class OcrHelper
{
    /// <summary>Runs Japanese OCR on an image file.</summary>
    /// <param name="filePath">Path of the image file; a <c>.heic</c> file is converted to JPEG first.</param>
    /// <returns>
    /// The recognised lines joined with <see cref="Environment.NewLine"/>, or null when
    /// conversion, decoding or recognition fails (the failure is logged).
    /// </returns>
    public static async Task<string?> Ocr(string filePath)
    {
        // Temporary JPEG produced for HEIC input; removed in the finally block on every path.
        string? convertedFilePath = null;
        try
        {
            if (string.Equals(Path.GetExtension(filePath), ".heic", StringComparison.OrdinalIgnoreCase))
            {
                try
                {
                    convertedFilePath = ConvertHeicToJpeg(filePath);
                    if (convertedFilePath == null)
                    {
                        LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR);
                        return null;
                    }
                    filePath = convertedFilePath;
                }
                catch (Exception conversionException)
                {
                    LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR);
                    return null;
                }
            }

            // Open the file and decode it into a bitmap. The path is normalised to an
            // absolute, backslash-separated form before it is handed to the WinRT API.
            var file = await StorageFile.GetFileFromPathAsync(Path.GetFullPath(filePath));
            // Both the stream and the bitmap are disposed when this try block is left,
            // i.e. before the temporary file is deleted in the finally block.
            using var stream = await file.OpenAsync(FileAccessMode.Read);
            var decoder = await BitmapDecoder.CreateAsync(stream);
            using var bmp = await decoder.GetSoftwareBitmapAsync();
            try
            {
                var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja"));
                if (engine == null)
                {
                    LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR);
                    return null;
                }

                var result = await engine.RecognizeAsync(bmp);
                // Take the text of every recognised line and join them with line breaks.
                return string.Join(Environment.NewLine, result.Lines.Select(line => line.Text));
            }
            catch (Exception ocrException)
            {
                LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR);
                return null;
            }
        }
        catch (Exception e)
        {
            LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
            return null;
        }
        finally
        {
            DeleteTemporaryFile(convertedFilePath);
        }
    }

    /// <summary>Converts a HEIC file to a JPEG file in the temporary directory.</summary>
    /// <param name="heicFilePath">Path of the HEIC file.</param>
    /// <returns>Path of the created JPEG file, or null when the conversion fails.</returns>
    private static string? ConvertHeicToJpeg(string heicFilePath)
    {
        string? outputFilePath = null;
        try
        {
            // A unique file in the temp directory: rewriting ".heic" inside the source
            // path would also touch directory names, and a JPEG written next to the
            // source could overwrite (and later delete) an unrelated existing file.
            outputFilePath = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid():N}.jpg");
            using (var image = new MagickImage(heicFilePath))
            {
                // Save as JPEG.
                image.Format = MagickFormat.Jpeg;
                image.Write(outputFilePath);
            }
            return outputFilePath;
        }
        catch (Exception e)
        {
            LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
            // Do not leave a partially written file behind.
            DeleteTemporaryFile(outputFilePath);
            return null; // null signals a failed conversion
        }
    }

    /// <summary>Deletes a temporary file if it exists; a failure is logged, never thrown.</summary>
    /// <param name="path">File to delete, or null when there is nothing to delete.</param>
    private static void DeleteTemporaryFile(string? path)
    {
        if (path == null || !File.Exists(path))
        {
            return;
        }

        try
        {
            File.Delete(path);
        }
        catch (Exception deleteException)
        {
            LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
        }
    }
}
}

View File

@ -1,5 +1,8 @@
namespace FY2526_SW_PoC_APIRelay using System.Reflection;
namespace FY2526_SW_PoC_APIRelay
{ {
internal class Program internal class Program
{ {
private enum ResultCode : int private enum ResultCode : int
@ -11,20 +14,125 @@
Error = 5000, Error = 5000,
ErrorReadFile = 5001, ErrorReadFile = 5001,
ErrorCleansing = 5002, ErrorCleansing = 5002,
FileAlreadyExists = 6001,
FileNotSupported = 6002
} }
/// <summary>
/// Entry point. Classifies the input file, stops if
/// <c>FileDiscrimination.FileExistCheck</c> reports it as already existing, reads its
/// text (OCR for images, document reader otherwise), cleanses the text and writes
/// the result to standard output.
/// </summary>
/// <param name="args">
/// Exactly two arguments: the path of the file to process and the path of the CSV
/// file that lists the special symbols.
/// </param>
/// <remarks>
/// On failure the numeric <c>ResultCode</c> is written to standard output instead.
/// NOTE(review): <c>[STAThread]</c> on an <c>async Task Main</c> may not give the code
/// after the first <c>await</c> an STA thread - confirm if the readers need STA.
/// </remarks>
[STAThread]
static async Task Main(string[] args)
{
    LogWriter.WriteLog("11ファイル前処理 開始", LogWriter.LogLevel.INFO);
    AppDomain.CurrentDomain.ProcessExit += (s, e) => CleanupPythonProcesses();
    AppDomain.CurrentDomain.UnhandledException += (s, e) => CleanupPythonProcesses();
    Console.CancelKeyPress += (sender, e) => CleanupPythonProcesses();
    try
    {
        // Validate the command-line arguments.
        if (args.Length != 2)
        {
            System.Console.Write("{0} Usage: FY2526-SW-PoC-APIRelay.exe \"ファイルパス\" \"特殊記号が記載されたCSVのパス\"", (int)ResultCode.BadParameter);
            return;
        }

        string filePath = args[0];
        string csvPath = args[1];

        var fileType = FileDiscrimination.GetFileType(filePath);
        if (fileType == FileType.Other)
        {
            LogWriter.WriteLog($"対応していないファイル形式です: {filePath}", LogWriter.LogLevel.INFO);
            Console.WriteLine($"{(int)ResultCode.FileNotSupported}");
            return;
        }

        // Awaited rather than read through Task.Result: blocking an async method ties
        // up the thread and wraps any failure in an AggregateException.
        if (await FileDiscrimination.FileExistCheck(filePath))
        {
            LogWriter.WriteLog($"ファイル既に存在する: {filePath}", LogWriter.LogLevel.INFO);
            Console.WriteLine($"{(int)ResultCode.FileAlreadyExists}");
            return;
        }

        string? fileContent;
        switch (fileType)
        {
            case FileType.Image:
                // Images go through OCR.
                fileContent = await GetDataFromFile.Ocr(filePath);
                Console.WriteLine($"{fileContent}");
                break;
            case FileType.Document:
                // Documents are read directly.
                fileContent = GetDataFromFile.ReadDocument(filePath);
                Console.WriteLine($"{fileContent}");
                break;
            default:
                LogWriter.WriteLog($"対応していないファイル形式です: {filePath}", LogWriter.LogLevel.INFO);
                Console.WriteLine($"{(int)ResultCode.FileNotSupported}");
                return;
        }

        if (fileContent == null)
        {
            System.Console.Write("{0}", (int)ResultCode.ErrorReadFile);
            LogWriter.WriteLog($"ファイルの読み込みに失敗しました。結果コード:{(int)ResultCode.ErrorReadFile}", LogWriter.LogLevel.ERROR);
            return;
        }

        // Cleansing.
        var cleansingText = GetDataFromFile.CleansingText(fileContent, csvPath);
        if (cleansingText == null)
        {
            System.Console.Write("{0}", (int)ResultCode.ErrorCleansing);
            LogWriter.WriteLog($"クレンジングに失敗しました。結果コード:{(int)ResultCode.ErrorCleansing}", LogWriter.LogLevel.ERROR);
            return;
        }

        Console.Write($"{cleansingText}");
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"実行エラー : {ex.Message}", LogWriter.LogLevel.ERROR);
        Console.WriteLine($"{(int)ResultCode.Error}");
    }
    finally
    {
        CleanupPythonProcesses();
    }
}
/// <summary>
/// Cleans up Python processes. Currently a no-op: the whole implementation below
/// is commented out, so calling this method has no effect.
/// </summary>
private static void CleanupPythonProcesses()
{
// Disabled implementation, kept as found: it iterated over
// SearchInVectorDB.pythonProcesses, killed every process that had not exited,
// disposed each one and finally cleared the list.
//foreach (var process in FY2526_SW_PoC_APIRelay.SearchInVectorDB.pythonProcesses.ToArray())
//{
// try
// {
// if (!process.HasExited)
// {
// LogWriter.WriteLog($"プロセスクリアされました", LogWriter.LogLevel.INFO);
// process.Kill();
// process.WaitForExit(1000);
// }
// }
// catch (Exception ex)
// {
// LogWriter.WriteLog($"プロセスクリアエラー発生: {ex.Message}", LogWriter.LogLevel.ERROR);
// }
// finally
// {
// process.Dispose();
// }
//}
//VectorSearch.SearchInVectorDB.pythonProcesses.Clear();
} }
} }
} }