From d9e0ed5ebca3fdff58869fab7eae761f1a4e0259 Mon Sep 17 00:00:00 2001 From: ou-yongtong Date: Wed, 29 Oct 2025 16:12:08 +0900 Subject: [PATCH] =?UTF-8?q?=E3=83=99=E3=82=AF=E3=83=88=E3=83=AB=E5=8C=96?= =?UTF-8?q?=E3=82=92=E8=BF=BD=E5=8A=A0=E3=80=81=E4=B8=8D=E8=A6=81=E3=81=AA?= =?UTF-8?q?=E3=83=95=E3=82=A1=E3=82=A4=E3=83=AB=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 +- FY2526-SW-PoC-APIRelay/DataProcess.cs | 241 ++++++++++++++++++++++ FY2526-SW-PoC-APIRelay/GetDataFromFile.cs | 18 -- FY2526-SW-PoC-APIRelay/OcrHelper.cs | 123 ----------- FY2526-SW-PoC-APIRelay/Program.cs | 55 +++-- 5 files changed, 281 insertions(+), 159 deletions(-) create mode 100644 FY2526-SW-PoC-APIRelay/DataProcess.cs delete mode 100644 FY2526-SW-PoC-APIRelay/OcrHelper.cs diff --git a/.gitignore b/.gitignore index 9491a2f..206eb07 100644 --- a/.gitignore +++ b/.gitignore @@ -360,4 +360,5 @@ MigrationBackup/ .ionide/ # Fody - auto-generated XML schema -FodyWeavers.xsd \ No newline at end of file +FodyWeavers.xsd +/.lingma/rules/project_rule.md diff --git a/FY2526-SW-PoC-APIRelay/DataProcess.cs b/FY2526-SW-PoC-APIRelay/DataProcess.cs new file mode 100644 index 0000000..34292d1 --- /dev/null +++ b/FY2526-SW-PoC-APIRelay/DataProcess.cs @@ -0,0 +1,241 @@ +using FY2526_SW_PoC_APIRelay; +using Newtonsoft.Json; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Diagnostics; +using System.Linq; +using System.Net.Http; +using System.Numerics; +using System.Reflection.Metadata; +using System.Text; +using System.Text.Json; +using System.Text.Json.Nodes; +using System.Threading.Tasks; + +namespace FY2526_SW_PoC_APIRelay +{ + public class SearchInVectorDB + { + private static readonly HttpClient httpClient = new HttpClient(); + private static readonly string qdrantUrl = "http://localhost:6333/collections/personalInformation/points"; + public static List pythonProcesses = new List(); + + /// メインエントリ:ドキュメントをベクトル化して、ベクトルデータベースに保存 + /// テキストファイルのパス + /// ファイルパス + /// 結果コード + public static async Task DocumentVectorization(string textFilePath, string filePath) + { + try + { + double[]? vector = await GenerateVector(textFilePath); + if (vector == null) + { + LogWriter.WriteLog($"Python実行エラー発生", LogWriter.LogLevel.ERROR); + Console.WriteLine($"{(int)ResultCode.PythonError}"); + return; + } + + string? content = GetTextFromFile(textFilePath); + if (content is null) + { + return; + } + if (await SaveToVectorDB(filePath, content, vector)) + { + LogWriter.WriteLog($"ベクトルデータベースに保存成功", LogWriter.LogLevel.INFO); + Console.WriteLine($"{(int)ResultCode.Success}"); + } + else + { + LogWriter.WriteLog($"ベクトルデータベースに保存失敗", LogWriter.LogLevel.ERROR); + Console.WriteLine($"{(int)ResultCode.Error}"); + } + } + catch (Exception ex) + { + LogWriter.WriteLog($"エラー発生: {ex.Message}", LogWriter.LogLevel.ERROR); + Console.WriteLine($"{(int)ResultCode.Error}"); + } + } + + /// Vector DB への保存 + /// テキストファイルのパス + /// コンテンツ + /// ベクトルデータ + /// データベースにコンテンツとベクトルを保存 + private static async Task SaveToVectorDB(string filePath, string content, double[] vector) + { + int newId = await GetNewId(); + if (newId == -1) return false; + + var point = new + { + points = new[] + { + new + { + id = newId, + payload = new { filePath, cluster = "/", indexID = "/", contents = content }, + vector + } + } + }; + + var json = JsonConvert.SerializeObject(point); + var httpContent = new StringContent(json, Encoding.UTF8, "application/json"); + + try + { + var response = await httpClient.PutAsync(qdrantUrl, httpContent); + string result = await response.Content.ReadAsStringAsync(); + return JsonNode.Parse(result)?["status"]?.ToString() == "ok"; + } + catch (Exception ex) + { + LogWriter.WriteLog($"Qdrantデータ保存エラー: {ex.Message}", LogWriter.LogLevel.ERROR); + return false; + } + } + + ///新ID取得 + ///新ID + ///ポイントの総件数を取得して、+1で返す + private static async Task GetNewId() + { + try + { + var response = await httpClient.PostAsync($"{qdrantUrl}/count", + new StringContent("{\"exact\": true}", Encoding.UTF8, "application/json")); + response.EnsureSuccessStatusCode(); + + var jsonDoc = JsonDocument.Parse(await response.Content.ReadAsStringAsync()); + return jsonDoc.RootElement.GetProperty("result").GetProperty("count").GetInt32() + 1; + } + catch (Exception ex) + { + LogWriter.WriteLog($"新ID取得エラー発生: {ex.Message}", LogWriter.LogLevel.ERROR); + return -1; + } + } + + /// Pythonスクリプトを呼び出してベクトルを生成 + /// テキストファイルのパス + /// ベクトルデータ + private static async Task GenerateVector(string textFilePath) + { + string? scriptPath; + string? exeDirectory; + string pythonPath = "python.exe"; + exeDirectory = AppContext.BaseDirectory; + if (exeDirectory != null) + { + scriptPath = Path.Combine(exeDirectory, "Python", "FileVectorization.py"); + } + else + { + LogWriter.WriteLog($"Pythonファイルのパスが間違っています", LogWriter.LogLevel.ERROR); + return null; + } + + var start = new ProcessStartInfo + { + FileName = pythonPath, + RedirectStandardOutput = true, + RedirectStandardError = true, + UseShellExecute = false, + CreateNoWindow = true, + StandardOutputEncoding = Encoding.UTF8, + StandardErrorEncoding = Encoding.UTF8, + }; + start.ArgumentList.Add(scriptPath); + start.ArgumentList.Add(textFilePath); + + try + { + using var process = Process.Start(start); + if (process == null) + { + LogWriter.WriteLog("Pythonプロセスの開始に失敗しました", LogWriter.LogLevel.ERROR); + return null; + } + string output = await process.StandardOutput.ReadToEndAsync(); + string error = await process.StandardError.ReadToEndAsync(); + + await process.WaitForExitAsync(); + + if (!string.IsNullOrEmpty(error)) + { + LogWriter.WriteLog($"Python実行エラー: {error}", LogWriter.LogLevel.ERROR); + return null; + } + LogWriter.WriteLog($"Python実行成功、ファイルベクトル化成功", LogWriter.LogLevel.INFO); + using var documet = JsonDocument.Parse(output); + var root = documet.RootElement; + + string? value = root.TryGetProperty("value", out var valueElement) ? valueElement.GetString() : null; + string? key = root.TryGetProperty("key", out var keyElement) ? keyElement.GetString() : null; + Console.WriteLine($"{key}"); + + if (value != null) + { + return ConvertStringToDouble(value); + } + else + { + LogWriter.WriteLog($"Python実行エラー、PythonからのJSONデータ解析失敗", LogWriter.LogLevel.ERROR); + return null; + } + } + catch (Exception ex) + { + LogWriter.WriteLog($"Pythonスクリプト実行エラー: {ex.Message}", LogWriter.LogLevel.ERROR); + return null; + } + } + + /// 文字列をdouble型の配列に変換 + /// テキストベクトルデータ + /// 文字列をdouble配列に変換 + private static double[]? ConvertStringToDouble(string vectorString) + { + try + { + return vectorString + .Trim('[', ']') + .Split(',') + .Select(s => double.Parse(s.Trim())) + .ToArray(); + } + catch (Exception ex) + { + LogWriter.WriteLog($"Python出力内容が異常、ベクトル変換エラー: {ex.Message}", LogWriter.LogLevel.ERROR); + return null; + } + } + + /// ファイルからテキストを抽出 + /// テキストファイルのパス + /// テキストのコンテンツ + private static string? GetTextFromFile(string filePath) + { + try + { + return File.ReadAllText(filePath); + } + catch (FileNotFoundException ex) + { + LogWriter.WriteLog($"ファイル存在しない: {ex.Message}", LogWriter.LogLevel.ERROR); + Console.WriteLine($"{(int)ResultCode.FileNotFound}"); + return null; + } + catch (Exception ex) + { + LogWriter.WriteLog($"ファイル読み込みエラー: {ex.Message}", LogWriter.LogLevel.ERROR); + Console.WriteLine($"{(int)ResultCode.ErrorReadFile}"); + return null; + } + } + } +} diff --git a/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs b/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs index 12c1247..4db643f 100644 --- a/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs +++ b/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs @@ -1,29 +1,11 @@ using ImageMagick; using InfoExtraction; using Microsoft.Office.Interop.Excel; -using Microsoft.Office.Interop.Excel; -using Microsoft.Office.Interop.Excel; using Microsoft.Office.Interop.PowerPoint; -using Microsoft.Office.Interop.PowerPoint; -using Microsoft.Office.Interop.PowerPoint; -using Microsoft.Office.Interop.Word; -using Microsoft.Office.Interop.Word; using Microsoft.Office.Interop.Word; using PdfiumViewer; -using PdfiumViewer; -using PretreatmentFile; -using System; -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Runtime.InteropServices; -using System.Runtime.Versioning; -using System.Text; using System.Text; using System.Text.RegularExpressions; -using System.Threading.Tasks; -using Windows.ApplicationModel.Core; using Windows.Graphics.Imaging; using Windows.Media.Ocr; using Windows.Storage; diff --git a/FY2526-SW-PoC-APIRelay/OcrHelper.cs b/FY2526-SW-PoC-APIRelay/OcrHelper.cs deleted file mode 100644 index 59d9e56..0000000 --- a/FY2526-SW-PoC-APIRelay/OcrHelper.cs +++ /dev/null @@ -1,123 +0,0 @@ -using FY2526_SW_PoC_APIRelay; -using ImageMagick; -using PretreatmentFile; -using System; -using System.IO; -using System.Threading.Tasks; -using Windows.Graphics.Imaging; -using Windows.Media.Ocr; -using Windows.Storage; -using Windows.Storage.Streams; - -namespace PretreatmentFile -{ - public class OcrHelper - { - /// OCR - /// ファイルパス - /// 読み取った文字列 - /// - public static async Task Ocr(string filePath) - { - try - { - string extension = Path.GetExtension(filePath).ToLower(); - string? convertedFilePath = null; - - if (extension == ".heic") - { - try - { - convertedFilePath = ConvertHeicToJpeg(filePath); - if (convertedFilePath == null) - { - LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR); - return null; - } - filePath = convertedFilePath; - } - catch (Exception conversionException) - { - LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR); - return null; - } - } - - // ファイルを開き、BitmapDecoderを作成 - var file = await StorageFile.GetFileFromPathAsync(filePath); - var stream = await file.OpenAsync(FileAccessMode.Read); - var decoder = await BitmapDecoder.CreateAsync(stream); - var bmp = await decoder.GetSoftwareBitmapAsync(); - - try - { - var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja")); - if (engine != null) - { - var result = await engine.RecognizeAsync(bmp); - - // OcrResult.Lines から各行のテキストを取得し、改行で結合 - var extractedText = string.Join(Environment.NewLine, result.Lines.Select(line => line.Text)); - - if (convertedFilePath != null && File.Exists(convertedFilePath)) - { - try - { - File.Delete(convertedFilePath); // JPEGファイルを削除 - } - catch (Exception deleteException) - { - LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR); - } - } - - return extractedText; - } - else - { - LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR); - return null; - } - } - catch (Exception ocrException) - { - LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR); - return null; - } - } - catch (Exception e) - { - LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR); - return null; - } - } - - - /// heicをjpegに変換する - /// heicファイルのパス - /// 変換後のjpegファイルのパス - /// - private static string? ConvertHeicToJpeg(string heicFilePath) - { - try - { - // Magick.NETを使用してHEICファイルをJPEGに変換 - string outputFilePath = heicFilePath.Replace(".heic", ".jpg", StringComparison.OrdinalIgnoreCase); - - using (var image = new MagickImage(heicFilePath)) - { - // JPEGとして保存 - image.Format = MagickFormat.Jpeg; - image.Write(outputFilePath); - } - - return outputFilePath; - } - catch (Exception e) - { - LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR); - return null; // 変換に失敗した場合はnullを返す - } - } - } -} \ No newline at end of file diff --git a/FY2526-SW-PoC-APIRelay/Program.cs b/FY2526-SW-PoC-APIRelay/Program.cs index b56dee2..e4668f6 100644 --- a/FY2526-SW-PoC-APIRelay/Program.cs +++ b/FY2526-SW-PoC-APIRelay/Program.cs @@ -2,28 +2,35 @@ namespace FY2526_SW_PoC_APIRelay { - + /// 結果コード + public enum ResultCode : int + { + Success = 2000, + BadParameter = 4000, + FileNotFound = 4004, + NotTarget = 4005, + Error = 5000, + ErrorReadFile = 5001, + ErrorCleansing = 5002, + ErrorSavingFile = 5003, + FileAlreadyExists = 6001, + FileNotSupported = 6002, + PythonError = 7001, + ModelLoadFailed = 7002, + ModelLoadSuccess = 7003, + PythonEndSuccess = 7004, + PythonInputError = 7005, + QdrantNoData = 8001, + QdrantError = 8002 + } internal class Program { - private enum ResultCode : int - { - Success = 2000, - BadParameter = 4000, - FileNotFound = 4004, - NotTarget = 4005, - Error = 5000, - ErrorReadFile = 5001, - ErrorCleansing = 5002, - FileAlreadyExists = 6001, - FileNotSupported = 6002 - } - /// メイン関数 /// 実行引数 [STAThread] static async Task Main(string[] args) { - LogWriter.WriteLog("11ファイル前処理 開始", LogWriter.LogLevel.INFO); + LogWriter.WriteLog("ファイル前処理 開始", LogWriter.LogLevel.INFO); AppDomain.CurrentDomain.ProcessExit += (s, e) => CleanupPythonProcesses(); AppDomain.CurrentDomain.UnhandledException += (s, e) => CleanupPythonProcesses(); Console.CancelKeyPress += (sender, e) => CleanupPythonProcesses(); @@ -69,7 +76,6 @@ namespace FY2526_SW_PoC_APIRelay break; case FileType.Document: fileContent = GetDataFromFile.ReadDocument(filePath); - Console.WriteLine($"{fileContent}"); // 読み出し break; default: @@ -94,8 +100,23 @@ namespace FY2526_SW_PoC_APIRelay return; } + // クレンジング後のテキストを保存 + string outputFilePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "cleansed_text.txt"); // 実行ファイルが存在するディレクトリを取得 + try + { + File.WriteAllText(outputFilePath, cleansingText); + LogWriter.WriteLog($"クレンジング後のテキストを {outputFilePath} に保存しました", LogWriter.LogLevel.INFO); + } + catch (Exception ex) + { + System.Console.Write("{0}", (int)ResultCode.ErrorSavingFile); + LogWriter.WriteLog($"テキストの保存に失敗しました: {ex.Message} 結果コード:{(int)ResultCode.ErrorSavingFile}", LogWriter.LogLevel.ERROR); + return; + } + + await SearchInVectorDB.DocumentVectorization(outputFilePath, filePath); + - Console.Write($"{cleansingText}"); } catch (Exception ex) {