ベクトル化を追加、不要なファイルを削除
This commit is contained in:
parent
abd9fb7753
commit
d9e0ed5ebc
1
.gitignore
vendored
1
.gitignore
vendored
@ -361,3 +361,4 @@ MigrationBackup/
|
||||
|
||||
# Fody - auto-generated XML schema
|
||||
FodyWeavers.xsd
|
||||
/.lingma/rules/project_rule.md
|
||||
|
||||
241
FY2526-SW-PoC-APIRelay/DataProcess.cs
Normal file
241
FY2526-SW-PoC-APIRelay/DataProcess.cs
Normal file
@ -0,0 +1,241 @@
|
||||
using FY2526_SW_PoC_APIRelay;
|
||||
using Newtonsoft.Json;
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
using System.Net.Http;
|
||||
using System.Numerics;
|
||||
using System.Reflection.Metadata;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Nodes;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace FY2526_SW_PoC_APIRelay
|
||||
{
|
||||
public class SearchInVectorDB
|
||||
{
|
||||
// Single HttpClient instance shared by every Qdrant request issued from this class.
private static readonly HttpClient httpClient = new();

// Points endpoint of the "personalInformation" collection on the local Qdrant server.
private static readonly string qdrantUrl = "http://localhost:6333/collections/personalInformation/points";

// Python child processes. No method of this class adds to the list;
// NOTE(review): presumably consumed by the host's cleanup handlers - confirm.
public static List<Process> pythonProcesses = new();
|
||||
|
||||
/// <summary>Entry point: vectorizes a document and stores it in the vector database.</summary>
/// <param name="textFilePath">Path of the text file that is vectorized and whose content is stored.</param>
/// <param name="filePath">File path recorded in the payload of the stored point.</param>
/// <returns>A task that completes when processing ends; the result code is written to standard output.</returns>
public static async Task DocumentVectorization(string textFilePath, string filePath)
{
    try
    {
        // Step 1: obtain the embedding from the Python script.
        double[]? embedding = await GenerateVector(textFilePath);
        if (embedding is null)
        {
            LogWriter.WriteLog($"Python実行エラー発生", LogWriter.LogLevel.ERROR);
            Console.WriteLine($"{(int)ResultCode.PythonError}");
            return;
        }

        // Step 2: load the text itself; GetTextFromFile reports its own result code on failure.
        string? text = GetTextFromFile(textFilePath);
        if (text is null)
        {
            return;
        }

        // Step 3: persist the text together with its embedding.
        bool saved = await SaveToVectorDB(filePath, text, embedding);
        if (!saved)
        {
            LogWriter.WriteLog($"ベクトルデータベースに保存失敗", LogWriter.LogLevel.ERROR);
            Console.WriteLine($"{(int)ResultCode.Error}");
            return;
        }

        LogWriter.WriteLog($"ベクトルデータベースに保存成功", LogWriter.LogLevel.INFO);
        Console.WriteLine($"{(int)ResultCode.Success}");
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"エラー発生: {ex.Message}", LogWriter.LogLevel.ERROR);
        Console.WriteLine($"{(int)ResultCode.Error}");
    }
}
|
||||
|
||||
/// <summary>Stores one point (payload plus vector) in the vector database.</summary>
/// <param name="filePath">File path recorded in the point payload.</param>
/// <param name="content">Text content recorded in the point payload.</param>
/// <param name="vector">Embedding vector of the content.</param>
/// <returns><c>true</c> when the response JSON reports status "ok"; otherwise <c>false</c>.</returns>
/// <remarks>Sends an HTTP PUT to the collection's points endpoint with a freshly allocated ID.</remarks>
private static async Task<bool> SaveToVectorDB(string filePath, string content, double[] vector)
{
    int newId = await GetNewId();
    if (newId == -1)
    {
        return false;
    }

    var point = new
    {
        points = new[]
        {
            new
            {
                id = newId,
                payload = new { filePath, cluster = "/", indexID = "/", contents = content },
                vector
            }
        }
    };

    string json = JsonConvert.SerializeObject(point);

    try
    {
        // Dispose both the request content and the response so their buffers
        // and the underlying connection are released promptly.
        using var httpContent = new StringContent(json, Encoding.UTF8, "application/json");
        using HttpResponseMessage response = await httpClient.PutAsync(qdrantUrl, httpContent);
        string result = await response.Content.ReadAsStringAsync();

        bool succeeded = JsonNode.Parse(result)?["status"]?.ToString() == "ok";
        if (!succeeded)
        {
            // Record why the save was rejected; previously this path failed silently.
            LogWriter.WriteLog($"Qdrantデータ保存エラー: HTTP {(int)response.StatusCode} {result}", LogWriter.LogLevel.ERROR);
        }

        return succeeded;
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"Qdrantデータ保存エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
        return false;
    }
}
|
||||
|
||||
/// <summary>Allocates the ID for the next point.</summary>
/// <returns>The exact point count of the collection plus one, or -1 when the count cannot be obtained.</returns>
/// <remarks>
/// Posts <c>{"exact": true}</c> to the <c>/count</c> endpoint and reads <c>result.count</c> from the reply.
/// NOTE(review): count + 1 is only unique while no point is ever deleted and no other writer runs
/// concurrently; otherwise the PUT in the caller may target an existing ID - confirm this is acceptable.
/// </remarks>
private static async Task<int> GetNewId()
{
    try
    {
        using var requestBody = new StringContent("{\"exact\": true}", Encoding.UTF8, "application/json");
        using HttpResponseMessage response = await httpClient.PostAsync($"{qdrantUrl}/count", requestBody);
        response.EnsureSuccessStatusCode();

        // JsonDocument rents pooled buffers, so it must be disposed.
        using JsonDocument jsonDoc = JsonDocument.Parse(await response.Content.ReadAsStringAsync());
        return jsonDoc.RootElement.GetProperty("result").GetProperty("count").GetInt32() + 1;
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"新ID取得エラー発生: {ex.Message}", LogWriter.LogLevel.ERROR);
        return -1;
    }
}
|
||||
|
||||
/// <summary>Runs the Python vectorization script and returns the vector it produces.</summary>
/// <param name="textFilePath">Path of the text file passed to the script as its argument.</param>
/// <returns>The parsed vector, or <c>null</c> when the script cannot be run or its output cannot be used.</returns>
/// <remarks>
/// The script <c>Python/FileVectorization.py</c> under the application base directory is started with
/// <c>python.exe</c>. Its standard output is parsed as a JSON object: the string property <c>value</c>
/// is converted to the vector and the property <c>key</c> is echoed to the console.
/// </remarks>
private static async Task<double[]?> GenerateVector(string textFilePath)
{
    const string pythonPath = "python.exe";

    // AppContext.BaseDirectory is never null, so verify the script itself instead.
    string scriptPath = Path.Combine(AppContext.BaseDirectory, "Python", "FileVectorization.py");
    if (!File.Exists(scriptPath))
    {
        LogWriter.WriteLog($"Pythonファイルのパスが間違っています", LogWriter.LogLevel.ERROR);
        return null;
    }

    var start = new ProcessStartInfo
    {
        FileName = pythonPath,
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
        StandardOutputEncoding = Encoding.UTF8,
        StandardErrorEncoding = Encoding.UTF8,
    };
    start.ArgumentList.Add(scriptPath);
    start.ArgumentList.Add(textFilePath);

    try
    {
        using var process = Process.Start(start);
        if (process == null)
        {
            LogWriter.WriteLog("Pythonプロセスの開始に失敗しました", LogWriter.LogLevel.ERROR);
            return null;
        }

        // Drain both pipes concurrently. Reading stdout to the end before touching
        // stderr can deadlock once the child fills the stderr pipe buffer.
        Task<string> outputTask = process.StandardOutput.ReadToEndAsync();
        Task<string> errorTask = process.StandardError.ReadToEndAsync();
        await Task.WhenAll(outputTask, errorTask);
        await process.WaitForExitAsync();

        string output = await outputTask;
        string error = await errorTask;

        // Any stderr output is treated as a failure (unchanged from the previous behavior).
        if (!string.IsNullOrEmpty(error))
        {
            LogWriter.WriteLog($"Python実行エラー: {error}", LogWriter.LogLevel.ERROR);
            return null;
        }

        // A non-zero exit code is a failure even when stderr stayed empty.
        if (process.ExitCode != 0)
        {
            LogWriter.WriteLog($"Python実行エラー: 終了コード {process.ExitCode}", LogWriter.LogLevel.ERROR);
            return null;
        }

        LogWriter.WriteLog($"Python実行成功、ファイルベクトル化成功", LogWriter.LogLevel.INFO);

        using var document = JsonDocument.Parse(output);
        var root = document.RootElement;

        string? value = root.TryGetProperty("value", out var valueElement) ? valueElement.GetString() : null;
        string? key = root.TryGetProperty("key", out var keyElement) ? keyElement.GetString() : null;
        Console.WriteLine($"{key}");

        if (value == null)
        {
            LogWriter.WriteLog($"Python実行エラー、PythonからのJSONデータ解析失敗", LogWriter.LogLevel.ERROR);
            return null;
        }

        return ConvertStringToDouble(value);
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"Pythonスクリプト実行エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
}
|
||||
|
||||
/// <summary>Converts a textual vector such as <c>[0.1, -2.5, 3e-4]</c> into a double array.</summary>
/// <param name="vectorString">Comma-separated numbers, optionally wrapped in square brackets.</param>
/// <returns>The parsed values, or <c>null</c> when any element is not a valid number.</returns>
/// <remarks>
/// Numbers are parsed with the invariant culture: the input is machine-generated, so "." must be
/// the decimal separator regardless of the operating-system locale.
/// </remarks>
private static double[]? ConvertStringToDouble(string vectorString)
{
    try
    {
        return vectorString
            .Trim('[', ']')
            .Split(',')
            .Select(s => double.Parse(
                s.Trim(),
                System.Globalization.NumberStyles.Float,
                System.Globalization.CultureInfo.InvariantCulture))
            .ToArray();
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"Python出力内容が異常、ベクトル変換エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
}
|
||||
|
||||
/// <summary>Reads the whole text of a file.</summary>
/// <param name="filePath">Path of the text file to read.</param>
/// <returns>The file content, or <c>null</c> when reading fails (a result code is then written to the console).</returns>
private static string? GetTextFromFile(string filePath)
{
    try
    {
        string text = File.ReadAllText(filePath);
        return text;
    }
    catch (Exception ex)
    {
        // A missing file gets its own message and result code; everything else is a read error.
        if (ex is FileNotFoundException)
        {
            LogWriter.WriteLog($"ファイル存在しない: {ex.Message}", LogWriter.LogLevel.ERROR);
            Console.WriteLine($"{(int)ResultCode.FileNotFound}");
        }
        else
        {
            LogWriter.WriteLog($"ファイル読み込みエラー: {ex.Message}", LogWriter.LogLevel.ERROR);
            Console.WriteLine($"{(int)ResultCode.ErrorReadFile}");
        }

        return null;
    }
}
|
||||
}
|
||||
}
|
||||
@ -1,29 +1,11 @@
|
||||
using ImageMagick;
|
||||
using InfoExtraction;
|
||||
using Microsoft.Office.Interop.Excel;
|
||||
using Microsoft.Office.Interop.Excel;
|
||||
using Microsoft.Office.Interop.Excel;
|
||||
using Microsoft.Office.Interop.PowerPoint;
|
||||
using Microsoft.Office.Interop.PowerPoint;
|
||||
using Microsoft.Office.Interop.PowerPoint;
|
||||
using Microsoft.Office.Interop.Word;
|
||||
using Microsoft.Office.Interop.Word;
|
||||
using Microsoft.Office.Interop.Word;
|
||||
using PdfiumViewer;
|
||||
using PdfiumViewer;
|
||||
using PretreatmentFile;
|
||||
using System;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Runtime.InteropServices;
|
||||
using System.Runtime.Versioning;
|
||||
using System.Text;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
using System.Threading.Tasks;
|
||||
using Windows.ApplicationModel.Core;
|
||||
using Windows.Graphics.Imaging;
|
||||
using Windows.Media.Ocr;
|
||||
using Windows.Storage;
|
||||
|
||||
@ -1,123 +0,0 @@
|
||||
using FY2526_SW_PoC_APIRelay;
|
||||
using ImageMagick;
|
||||
using PretreatmentFile;
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Threading.Tasks;
|
||||
using Windows.Graphics.Imaging;
|
||||
using Windows.Media.Ocr;
|
||||
using Windows.Storage;
|
||||
using Windows.Storage.Streams;
|
||||
|
||||
namespace PretreatmentFile
|
||||
{
|
||||
public class OcrHelper
|
||||
{
|
||||
/// <summary>Runs Japanese OCR on an image file.</summary>
/// <param name="filePath">Path of the image; a ".heic" file is converted to JPEG first.</param>
/// <returns>The recognized lines joined with line breaks, or <c>null</c> on any failure.</returns>
/// <remarks>
/// The temporary JPEG produced for a HEIC input is removed on every exit path, after the
/// stream and bitmap that reference it have been disposed.
/// </remarks>
public static async Task<string?> Ocr(string filePath)
{
    string? convertedFilePath = null;

    try
    {
        // Ordinal comparison: ToLower() is culture-sensitive (e.g. the Turkish dotless i).
        string extension = Path.GetExtension(filePath);
        if (string.Equals(extension, ".heic", StringComparison.OrdinalIgnoreCase))
        {
            try
            {
                convertedFilePath = ConvertHeicToJpeg(filePath);
                if (convertedFilePath == null)
                {
                    LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR);
                    return null;
                }

                filePath = convertedFilePath;
            }
            catch (Exception conversionException)
            {
                LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR);
                return null;
            }
        }

        // Open the file and decode it into a bitmap. Both the stream and the bitmap are
        // disposed when this try block is left, i.e. before the finally clause below runs.
        var file = await StorageFile.GetFileFromPathAsync(filePath);
        using var stream = await file.OpenAsync(FileAccessMode.Read);
        var decoder = await BitmapDecoder.CreateAsync(stream);
        using var bmp = await decoder.GetSoftwareBitmapAsync();

        try
        {
            var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja"));
            if (engine == null)
            {
                LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR);
                return null;
            }

            var result = await engine.RecognizeAsync(bmp);

            // Join the text of every recognized line with line breaks.
            return string.Join(Environment.NewLine, result.Lines.Select(line => line.Text));
        }
        catch (Exception ocrException)
        {
            LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR);
            return null;
        }
    }
    catch (Exception e)
    {
        LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
    finally
    {
        // Best-effort cleanup of the intermediate JPEG; never throw from finally.
        if (convertedFilePath != null && File.Exists(convertedFilePath))
        {
            try
            {
                File.Delete(convertedFilePath);
            }
            catch (Exception deleteException)
            {
                LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
            }
        }
    }
}
|
||||
|
||||
|
||||
/// <summary>Converts a HEIC image to JPEG.</summary>
/// <param name="heicFilePath">Path of the HEIC file.</param>
/// <returns>Path of the JPEG that was written, or <c>null</c> when the conversion fails.</returns>
/// <remarks>
/// The JPEG is written to a uniquely named file in the temporary directory. Deriving the name from
/// the source path would overwrite an existing ".jpg" next to the source (which the caller then
/// deletes), and a plain text replace of ".heic" would also rewrite directory names containing it.
/// </remarks>
private static string? ConvertHeicToJpeg(string heicFilePath)
{
    string outputFilePath = Path.Combine(
        Path.GetTempPath(),
        $"{Path.GetFileNameWithoutExtension(heicFilePath)}_{Guid.NewGuid():N}.jpg");

    try
    {
        // Convert with Magick.NET and save as JPEG.
        using var image = new MagickImage(heicFilePath);
        image.Format = MagickFormat.Jpeg;
        image.Write(outputFilePath);

        return outputFilePath;
    }
    catch (Exception e)
    {
        LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);

        // Do not leave a partially written file behind; cleanup is best-effort.
        try
        {
            if (File.Exists(outputFilePath))
            {
                File.Delete(outputFilePath);
            }
        }
        catch (Exception deleteException)
        {
            LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
        }

        return null; // null signals a failed conversion
    }
}
|
||||
}
|
||||
}
|
||||
@ -2,28 +2,35 @@
|
||||
|
||||
namespace FY2526_SW_PoC_APIRelay
|
||||
{
|
||||
|
||||
/// <summary>Result codes written to standard output / the log as integers.</summary>
public enum ResultCode
{
    // Written by SearchInVectorDB.DocumentVectorization after a successful save.
    Success = 2000,

    BadParameter = 4000,
    // Written by SearchInVectorDB.GetTextFromFile when the file does not exist.
    FileNotFound = 4004,
    NotTarget = 4005,

    // Written when saving to the vector database fails or an unexpected exception occurs.
    Error = 5000,
    // Written by SearchInVectorDB.GetTextFromFile for any other read failure.
    ErrorReadFile = 5001,
    ErrorCleansing = 5002,
    // Written by Main when the cleansed text cannot be saved.
    ErrorSavingFile = 5003,

    FileAlreadyExists = 6001,
    FileNotSupported = 6002,

    // Written when the Python script does not yield a vector.
    PythonError = 7001,
    ModelLoadFailed = 7002,
    ModelLoadSuccess = 7003,
    PythonEndSuccess = 7004,
    PythonInputError = 7005,

    QdrantNoData = 8001,
    QdrantError = 8002
}
|
||||
internal class Program
|
||||
{
|
||||
private enum ResultCode : int
|
||||
{
|
||||
Success = 2000,
|
||||
BadParameter = 4000,
|
||||
FileNotFound = 4004,
|
||||
NotTarget = 4005,
|
||||
Error = 5000,
|
||||
ErrorReadFile = 5001,
|
||||
ErrorCleansing = 5002,
|
||||
FileAlreadyExists = 6001,
|
||||
FileNotSupported = 6002
|
||||
}
|
||||
|
||||
/// <summary>メイン関数</summary>
|
||||
/// <param name="args">実行引数</param>
|
||||
[STAThread]
|
||||
static async Task Main(string[] args)
|
||||
{
|
||||
LogWriter.WriteLog("11ファイル前処理 開始", LogWriter.LogLevel.INFO);
|
||||
LogWriter.WriteLog("ファイル前処理 開始", LogWriter.LogLevel.INFO);
|
||||
AppDomain.CurrentDomain.ProcessExit += (s, e) => CleanupPythonProcesses();
|
||||
AppDomain.CurrentDomain.UnhandledException += (s, e) => CleanupPythonProcesses();
|
||||
Console.CancelKeyPress += (sender, e) => CleanupPythonProcesses();
|
||||
@ -69,7 +76,6 @@ namespace FY2526_SW_PoC_APIRelay
|
||||
break;
|
||||
case FileType.Document:
|
||||
fileContent = GetDataFromFile.ReadDocument(filePath);
|
||||
Console.WriteLine($"{fileContent}");
|
||||
// 読み出し
|
||||
break;
|
||||
default:
|
||||
@ -94,8 +100,23 @@ namespace FY2526_SW_PoC_APIRelay
|
||||
return;
|
||||
}
|
||||
|
||||
// クレンジング後のテキストを保存
|
||||
string outputFilePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "cleansed_text.txt"); // 実行ファイルが存在するディレクトリを取得
|
||||
try
|
||||
{
|
||||
File.WriteAllText(outputFilePath, cleansingText);
|
||||
LogWriter.WriteLog($"クレンジング後のテキストを {outputFilePath} に保存しました", LogWriter.LogLevel.INFO);
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
System.Console.Write("{0}", (int)ResultCode.ErrorSavingFile);
|
||||
LogWriter.WriteLog($"テキストの保存に失敗しました: {ex.Message} 結果コード:{(int)ResultCode.ErrorSavingFile}", LogWriter.LogLevel.ERROR);
|
||||
return;
|
||||
}
|
||||
|
||||
await SearchInVectorDB.DocumentVectorization(outputFilePath, filePath);
|
||||
|
||||
|
||||
Console.Write($"{cleansingText}");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user