ベクトル化を追加、不要なファイルを削除

This commit is contained in:
ou-yongtong 2025-10-29 16:12:08 +09:00
parent abd9fb7753
commit d9e0ed5ebc
5 changed files with 281 additions and 159 deletions

1
.gitignore vendored
View File

@ -361,3 +361,4 @@ MigrationBackup/
# Fody - auto-generated XML schema # Fody - auto-generated XML schema
FodyWeavers.xsd FodyWeavers.xsd
/.lingma/rules/project_rule.md

View File

@ -0,0 +1,241 @@
using FY2526_SW_PoC_APIRelay;
using Newtonsoft.Json;
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Net.Http;
using System.Numerics;
using System.Reflection.Metadata;
using System.Text;
using System.Text.Json;
using System.Text.Json.Nodes;
using System.Threading.Tasks;
namespace FY2526_SW_PoC_APIRelay
{
public class SearchInVectorDB
{
// Single HttpClient instance shared by every HTTP request issued from this class.
private static readonly HttpClient httpClient = new HttpClient();
// Points endpoint of the "personalInformation" collection on the server at localhost:6333.
private static readonly string qdrantUrl = "http://localhost:6333/collections/personalInformation/points";
// Publicly mutable list of Process objects; nothing inside this class adds to it or reads from it.
// NOTE(review): presumably populated and cleaned up by the caller's process-cleanup routine — confirm.
public static List<Process> pythonProcesses = new List<Process>();
/// <summary>Main entry point: vectorizes a document and stores the result in the vector database.</summary>
/// <param name="textFilePath">Path of the text file that is vectorized and whose text is stored as the point's contents.</param>
/// <param name="filePath">File path recorded in the stored point's payload.</param>
/// <returns>
/// A task that completes when processing has finished. The outcome is reported by writing a numeric
/// result code to standard output; when reading the text file fails, the code is written by
/// <c>GetTextFromFile</c> and nothing further is written here.
/// </returns>
public static async Task DocumentVectorization(string textFilePath, string filePath)
{
    try
    {
        // Step 1: obtain the embedding from the Python script.
        double[]? embedding = await GenerateVector(textFilePath);
        if (embedding is null)
        {
            LogWriter.WriteLog($"Python実行エラー発生", LogWriter.LogLevel.ERROR);
            Console.WriteLine($"{(int)ResultCode.PythonError}");
            return;
        }

        // Step 2: read the text that is stored alongside the embedding.
        string? documentText = GetTextFromFile(textFilePath);
        if (documentText == null)
        {
            // GetTextFromFile has already logged the failure and written its result code.
            return;
        }

        // Step 3: persist the point and report the outcome.
        bool stored = await SaveToVectorDB(filePath, documentText, embedding);
        if (!stored)
        {
            LogWriter.WriteLog($"ベクトルデータベースに保存失敗", LogWriter.LogLevel.ERROR);
            Console.WriteLine($"{(int)ResultCode.Error}");
            return;
        }

        LogWriter.WriteLog($"ベクトルデータベースに保存成功", LogWriter.LogLevel.INFO);
        Console.WriteLine($"{(int)ResultCode.Success}");
    }
    catch (Exception failure)
    {
        LogWriter.WriteLog($"エラー発生: {failure.Message}", LogWriter.LogLevel.ERROR);
        Console.WriteLine($"{(int)ResultCode.Error}");
    }
}
/// <summary>Stores one point (payload plus vector) in the vector database.</summary>
/// <param name="filePath">File path stored in the payload's <c>filePath</c> field.</param>
/// <param name="content">Text stored in the payload's <c>contents</c> field.</param>
/// <param name="vector">Vector stored with the point.</param>
/// <returns>
/// <c>true</c> when the response body reports <c>"status": "ok"</c>; <c>false</c> when no ID could be
/// obtained, the request or response parsing throws, or any other status is reported.
/// </returns>
/// <remarks>The <c>cluster</c> and <c>indexID</c> payload fields are always written as "/".</remarks>
private static async Task<bool> SaveToVectorDB(string filePath, string content, double[] vector)
{
    int newId = await GetNewId();
    if (newId == -1)
    {
        // GetNewId has already logged the reason.
        return false;
    }

    var point = new
    {
        points = new[]
        {
            new
            {
                id = newId,
                payload = new { filePath, cluster = "/", indexID = "/", contents = content },
                vector
            }
        }
    };
    string json = JsonConvert.SerializeObject(point);
    try
    {
        // Both the request content and the response are IDisposable; dispose them so their
        // buffers and the underlying connection are released promptly.
        using var httpContent = new StringContent(json, Encoding.UTF8, "application/json");
        using HttpResponseMessage response = await httpClient.PutAsync(qdrantUrl, httpContent);
        string result = await response.Content.ReadAsStringAsync();
        return JsonNode.Parse(result)?["status"]?.ToString() == "ok";
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"Qdrantデータ保存エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
        return false;
    }
}
/// <summary>Obtains the ID to use for the next point.</summary>
/// <returns>The exact point count of the collection plus one, or <c>-1</c> when the count request fails.</returns>
/// <remarks>
/// NOTE(review): the ID is derived from the current point count, so it can coincide with an existing ID
/// if points were deleted or were inserted with non-sequential IDs, and concurrent callers can receive
/// the same value — confirm this is acceptable for the collection.
/// </remarks>
private static async Task<int> GetNewId()
{
    try
    {
        // Request content, response and JsonDocument are all IDisposable (JsonDocument rents pooled
        // buffers); dispose them deterministically instead of leaving them to the GC.
        using var requestBody = new StringContent("{\"exact\": true}", Encoding.UTF8, "application/json");
        using HttpResponseMessage response = await httpClient.PostAsync($"{qdrantUrl}/count", requestBody);
        response.EnsureSuccessStatusCode();
        using JsonDocument jsonDoc = JsonDocument.Parse(await response.Content.ReadAsStringAsync());
        return jsonDoc.RootElement.GetProperty("result").GetProperty("count").GetInt32() + 1;
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"新ID取得エラー発生: {ex.Message}", LogWriter.LogLevel.ERROR);
        return -1;
    }
}
/// <summary>Runs the Python script <c>Python/FileVectorization.py</c> to generate a vector for a text file.</summary>
/// <param name="textFilePath">Path of the text file, passed to the script as its argument.</param>
/// <returns>
/// The vector parsed from the <c>value</c> property of the script's JSON output, or <c>null</c> when the
/// script is missing, the process cannot be started, anything is written to standard error, or the
/// output cannot be parsed.
/// </returns>
/// <remarks>The <c>key</c> property of the script's JSON output is written to standard output as-is.</remarks>
private static async Task<double[]?> GenerateVector(string textFilePath)
{
    string pythonPath = "python.exe";
    // AppContext.BaseDirectory is never null, so validate the resolved script path instead.
    string scriptPath = Path.Combine(AppContext.BaseDirectory, "Python", "FileVectorization.py");
    if (!File.Exists(scriptPath))
    {
        LogWriter.WriteLog($"Pythonファイルのパスが間違っています", LogWriter.LogLevel.ERROR);
        return null;
    }

    var start = new ProcessStartInfo
    {
        FileName = pythonPath,
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
        StandardOutputEncoding = Encoding.UTF8,
        StandardErrorEncoding = Encoding.UTF8,
    };
    start.ArgumentList.Add(scriptPath);
    start.ArgumentList.Add(textFilePath);
    try
    {
        using var process = Process.Start(start);
        if (process == null)
        {
            LogWriter.WriteLog("Pythonプロセスの開始に失敗しました", LogWriter.LogLevel.ERROR);
            return null;
        }

        // Drain both redirected streams concurrently. Reading stdout to the end before touching
        // stderr can deadlock once the child blocks on a full stderr pipe buffer.
        Task<string> outputTask = process.StandardOutput.ReadToEndAsync();
        Task<string> errorTask = process.StandardError.ReadToEndAsync();
        await Task.WhenAll(outputTask, errorTask);
        await process.WaitForExitAsync();
        string output = await outputTask;
        string error = await errorTask;

        // Any text on standard error is treated as a failure of the script.
        if (!string.IsNullOrEmpty(error))
        {
            LogWriter.WriteLog($"Python実行エラー: {error}", LogWriter.LogLevel.ERROR);
            return null;
        }
        LogWriter.WriteLog($"Python実行成功、ファイルベクトル化成功", LogWriter.LogLevel.INFO);

        using var document = JsonDocument.Parse(output);
        var root = document.RootElement;
        string? value = root.TryGetProperty("value", out var valueElement) ? valueElement.GetString() : null;
        string? key = root.TryGetProperty("key", out var keyElement) ? keyElement.GetString() : null;
        Console.WriteLine($"{key}");
        if (value != null)
        {
            return ConvertStringToDouble(value);
        }

        LogWriter.WriteLog($"Python実行エラー、PythonからのJSONデータ解析失敗", LogWriter.LogLevel.ERROR);
        return null;
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"Pythonスクリプト実行エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
}
/// <summary>Converts a textual vector such as <c>[0.1, 0.2, 0.3]</c> into a <c>double</c> array.</summary>
/// <param name="vectorString">Comma-separated numbers, optionally wrapped in square brackets.</param>
/// <returns>The parsed values, or <c>null</c> (after logging) when any element cannot be parsed.</returns>
/// <remarks>
/// Elements are parsed with the invariant culture: the text is machine-generated and uses "." as the
/// decimal separator, so parsing must not depend on the culture of the machine running this program.
/// </remarks>
private static double[]? ConvertStringToDouble(string vectorString)
{
    try
    {
        return vectorString
            .Trim('[', ']')
            .Split(',')
            .Select(s => double.Parse(s.Trim(), System.Globalization.CultureInfo.InvariantCulture))
            .ToArray();
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"Python出力内容が異常、ベクトル変換エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
}
/// <summary>Reads the whole text of a file.</summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <returns>
/// The file's text, or <c>null</c> when reading fails. On failure the error is logged and a result code
/// is written to standard output: <c>FileNotFound</c> for a missing file, <c>ErrorReadFile</c> otherwise.
/// </returns>
private static string? GetTextFromFile(string filePath)
{
    try
    {
        string text = File.ReadAllText(filePath);
        return text;
    }
    catch (Exception readFailure)
    {
        // A missing file gets its own log message and result code; every other failure is a read error.
        bool fileMissing = readFailure is FileNotFoundException;
        string logMessage = fileMissing
            ? $"ファイル存在しない: {readFailure.Message}"
            : $"ファイル読み込みエラー: {readFailure.Message}";
        ResultCode code = fileMissing ? ResultCode.FileNotFound : ResultCode.ErrorReadFile;

        LogWriter.WriteLog(logMessage, LogWriter.LogLevel.ERROR);
        Console.WriteLine($"{(int)code}");
        return null;
    }
}
}
}

View File

@ -1,29 +1,11 @@
using ImageMagick; using ImageMagick;
using InfoExtraction; using InfoExtraction;
using Microsoft.Office.Interop.Excel; using Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.PowerPoint; using Microsoft.Office.Interop.PowerPoint;
using Microsoft.Office.Interop.PowerPoint;
using Microsoft.Office.Interop.PowerPoint;
using Microsoft.Office.Interop.Word;
using Microsoft.Office.Interop.Word;
using Microsoft.Office.Interop.Word; using Microsoft.Office.Interop.Word;
using PdfiumViewer; using PdfiumViewer;
using PdfiumViewer;
using PretreatmentFile;
using System;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Runtime.Versioning;
using System.Text;
using System.Text; using System.Text;
using System.Text.RegularExpressions; using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Windows.ApplicationModel.Core;
using Windows.Graphics.Imaging; using Windows.Graphics.Imaging;
using Windows.Media.Ocr; using Windows.Media.Ocr;
using Windows.Storage; using Windows.Storage;

View File

@ -1,123 +0,0 @@
using FY2526_SW_PoC_APIRelay;
using ImageMagick;
using PretreatmentFile;
using System;
using System.IO;
using System.Threading.Tasks;
using Windows.Graphics.Imaging;
using Windows.Media.Ocr;
using Windows.Storage;
using Windows.Storage.Streams;
namespace PretreatmentFile
{
public class OcrHelper
{
/// <summary>Runs OCR with the Japanese ("ja") engine on an image file.</summary>
/// <param name="filePath">Path of the image file; a <c>.heic</c> file is converted to JPEG first.</param>
/// <returns>
/// The recognized lines joined with <see cref="Environment.NewLine"/>, or <c>null</c> (after logging)
/// when conversion, OCR engine creation, or recognition fails.
/// </returns>
/// <remarks>The intermediate JPEG created for a HEIC input is deleted on every exit path.</remarks>
public static async Task<string?> Ocr(string filePath)
{
    string? convertedFilePath = null;
    try
    {
        // Ordinal, case-insensitive comparison: ToLower() is culture-sensitive and can fail to
        // match ".HEIC" under some cultures (for example Turkish casing rules for 'I').
        if (string.Equals(Path.GetExtension(filePath), ".heic", StringComparison.OrdinalIgnoreCase))
        {
            try
            {
                convertedFilePath = ConvertHeicToJpeg(filePath);
                if (convertedFilePath == null)
                {
                    LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR);
                    return null;
                }
                filePath = convertedFilePath;
            }
            catch (Exception conversionException)
            {
                LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR);
                return null;
            }
        }

        // Open the file and decode it into a bitmap. The stream and the bitmap are disposed when
        // this block is left, i.e. before the finally block below deletes the temporary JPEG.
        var file = await StorageFile.GetFileFromPathAsync(filePath);
        using var stream = await file.OpenAsync(FileAccessMode.Read);
        var decoder = await BitmapDecoder.CreateAsync(stream);
        using var bmp = await decoder.GetSoftwareBitmapAsync();
        try
        {
            var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja"));
            if (engine == null)
            {
                LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR);
                return null;
            }

            var result = await engine.RecognizeAsync(bmp);
            // Take the text of each recognized line and join the lines with newlines.
            return string.Join(Environment.NewLine, result.Lines.Select(line => line.Text));
        }
        catch (Exception ocrException)
        {
            LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR);
            return null;
        }
    }
    catch (Exception e)
    {
        LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
    finally
    {
        // Remove the intermediate JPEG whether or not OCR succeeded.
        if (convertedFilePath != null && File.Exists(convertedFilePath))
        {
            try
            {
                File.Delete(convertedFilePath);
            }
            catch (Exception deleteException)
            {
                LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
            }
        }
    }
}
/// <summary>Converts a HEIC image to a JPEG file using Magick.NET.</summary>
/// <param name="heicFilePath">Path of the HEIC file to convert.</param>
/// <returns>Path of the JPEG file that was written, or <c>null</c> (after logging) when conversion fails.</returns>
/// <remarks>
/// The JPEG is written to a uniquely named file in the temporary directory rather than next to the
/// source. Deriving the name with <c>Replace(".heic", ".jpg")</c> rewrote every occurrence in the path
/// (directory names included) and could overwrite an existing JPEG of the same name, which the caller
/// then deletes.
/// </remarks>
private static string? ConvertHeicToJpeg(string heicFilePath)
{
    string? outputFilePath = null;
    try
    {
        outputFilePath = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid():N}.jpg");
        using var image = new MagickImage(heicFilePath);
        // Save as JPEG.
        image.Format = MagickFormat.Jpeg;
        image.Write(outputFilePath);
        return outputFilePath;
    }
    catch (Exception e)
    {
        LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
        // Best effort: do not leave a partially written JPEG behind.
        if (outputFilePath != null)
        {
            try
            {
                File.Delete(outputFilePath);
            }
            catch (Exception deleteException)
            {
                LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
            }
        }
        return null; // null signals that the conversion failed.
    }
}
}
}

View File

@ -2,10 +2,8 @@
namespace FY2526_SW_PoC_APIRelay namespace FY2526_SW_PoC_APIRelay
{ {
/// <summary>結果コード</summary>
internal class Program public enum ResultCode : int
{
private enum ResultCode : int
{ {
Success = 2000, Success = 2000,
BadParameter = 4000, BadParameter = 4000,
@ -14,16 +12,25 @@ namespace FY2526_SW_PoC_APIRelay
Error = 5000, Error = 5000,
ErrorReadFile = 5001, ErrorReadFile = 5001,
ErrorCleansing = 5002, ErrorCleansing = 5002,
ErrorSavingFile = 5003,
FileAlreadyExists = 6001, FileAlreadyExists = 6001,
FileNotSupported = 6002 FileNotSupported = 6002,
PythonError = 7001,
ModelLoadFailed = 7002,
ModelLoadSuccess = 7003,
PythonEndSuccess = 7004,
PythonInputError = 7005,
QdrantNoData = 8001,
QdrantError = 8002
} }
internal class Program
{
/// <summary>メイン関数</summary> /// <summary>メイン関数</summary>
/// <param name="args">実行引数</param> /// <param name="args">実行引数</param>
[STAThread] [STAThread]
static async Task Main(string[] args) static async Task Main(string[] args)
{ {
LogWriter.WriteLog("11ファイル前処理 開始", LogWriter.LogLevel.INFO); LogWriter.WriteLog("ファイル前処理 開始", LogWriter.LogLevel.INFO);
AppDomain.CurrentDomain.ProcessExit += (s, e) => CleanupPythonProcesses(); AppDomain.CurrentDomain.ProcessExit += (s, e) => CleanupPythonProcesses();
AppDomain.CurrentDomain.UnhandledException += (s, e) => CleanupPythonProcesses(); AppDomain.CurrentDomain.UnhandledException += (s, e) => CleanupPythonProcesses();
Console.CancelKeyPress += (sender, e) => CleanupPythonProcesses(); Console.CancelKeyPress += (sender, e) => CleanupPythonProcesses();
@ -69,7 +76,6 @@ namespace FY2526_SW_PoC_APIRelay
break; break;
case FileType.Document: case FileType.Document:
fileContent = GetDataFromFile.ReadDocument(filePath); fileContent = GetDataFromFile.ReadDocument(filePath);
Console.WriteLine($"{fileContent}");
// 読み出し // 読み出し
break; break;
default: default:
@ -94,8 +100,23 @@ namespace FY2526_SW_PoC_APIRelay
return; return;
} }
// クレンジング後のテキストを保存
string outputFilePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "cleansed_text.txt"); // 実行ファイルが存在するディレクトリを取得
try
{
File.WriteAllText(outputFilePath, cleansingText);
LogWriter.WriteLog($"クレンジング後のテキストを {outputFilePath} に保存しました", LogWriter.LogLevel.INFO);
}
catch (Exception ex)
{
System.Console.Write("{0}", (int)ResultCode.ErrorSavingFile);
LogWriter.WriteLog($"テキストの保存に失敗しました: {ex.Message} 結果コード:{(int)ResultCode.ErrorSavingFile}", LogWriter.LogLevel.ERROR);
return;
}
await SearchInVectorDB.DocumentVectorization(outputFilePath, filePath);
Console.Write($"{cleansingText}");
} }
catch (Exception ex) catch (Exception ex)
{ {