diff --git a/FY2526-SW-PoC-APIRelay/ComWrapper.cs b/FY2526-SW-PoC-APIRelay/ComWrapper.cs
new file mode 100644
index 0000000..59b076c
--- /dev/null
+++ b/FY2526-SW-PoC-APIRelay/ComWrapper.cs
@@ -0,0 +1,57 @@
+using System.Runtime.InteropServices;
+using System.Runtime.Versioning;
+
+namespace InfoExtraction
+{
+    /// <summary>
+    /// Holds one COM object of type <typeparamref name="T"/> and releases it with
+    /// <see cref="Marshal.ReleaseComObject(object)"/> when disposed, so the reference
+    /// can be scoped with <c>using</c>.
+    /// </summary>
+    /// <typeparam name="T">Type of the wrapped COM object.</typeparam>
+    [SupportedOSPlatform("windows")]
+    class ComWrapper<T> : IDisposable
+    {
+        /// <summary>The wrapped object, exactly as passed to the constructor.</summary>
+        public T ComObject { get; }
+
+        /// <summary>Wraps <paramref name="comObject"/>; the wrapper releases it on disposal.</summary>
+        public ComWrapper(T comObject)
+        {
+            this.ComObject = comObject;
+        }
+
+        private bool disposedValue = false;
+
+        /// <summary>Releases the COM reference once; later calls do nothing.</summary>
+        /// <param name="disposing">True when called from <see cref="Dispose()"/>, false from the finalizer.</param>
+        protected virtual void Dispose(bool disposing)
+        {
+            if (disposedValue)
+            {
+                return;
+            }
+
+            // ReleaseComObject throws ArgumentException for anything that is not a COM object,
+            // and an exception escaping the finalizer would terminate the process, so check first.
+            if (ComObject != null && Marshal.IsComObject(ComObject))
+            {
+                Marshal.ReleaseComObject(ComObject);
+            }
+            disposedValue = true;
+        }
+
+        ~ComWrapper()
+        {
+            Dispose(false);
+        }
+
+        /// <summary>Releases the COM object and takes this instance off the finalization queue.</summary>
+        public void Dispose()
+        {
+            Dispose(true);
+            // The reference is already released; without this every wrapper is still finalized.
+            GC.SuppressFinalize(this);
+        }
+    }
+}
diff --git a/FY2526-SW-PoC-APIRelay/FY2526-SW-PoC-APIRelay.csproj b/FY2526-SW-PoC-APIRelay/FY2526-SW-PoC-APIRelay.csproj index e098a36..88a0ed1 100644 --- a/FY2526-SW-PoC-APIRelay/FY2526-SW-PoC-APIRelay.csproj +++ b/FY2526-SW-PoC-APIRelay/FY2526-SW-PoC-APIRelay.csproj @@ -1,16 +1,60 @@  - - Exe - net8.0 - FY2526_SW_PoC_APIRelay - enable - enable - + + Exe + net8.0-windows10.0.22000.0 + enable + enable + - - - + + + tlbimp + 8 + 2 + 2df8d04c-5bfa-101b-bde5-00aa0044de52 + 0 + false + true + + + tlbimp + 9 + 1 + 00020813-0000-0000-c000-000000000046 + 0 + false + true + + + tlbimp + 7 + 8 + 00020905-0000-0000-c000-000000000046 + 0 + false + true + + + tlbimp + 12 + 2 + 91493440-5a91-11cf-8700-00aa0060263b + 0 + false + true + + + + + + + + + + + + 
diff --git a/FY2526-SW-PoC-APIRelay/FileDiscrimination.cs b/FY2526-SW-PoC-APIRelay/FileDiscrimination.cs new file mode 100644 index 0000000..6db480c --- /dev/null +++ b/FY2526-SW-PoC-APIRelay/FileDiscrimination.cs @@ -0,0 +1,65 @@ +using Microsoft.VisualBasic.FileIO; +using Newtonsoft.Json; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Net.Http; +using 
System.Text;
+using System.Text.Json.Nodes;
+using System.Threading.Tasks;
+using static FY2526_SW_PoC_APIRelay.LogWriter;
+
+namespace FY2526_SW_PoC_APIRelay
+{
+    /// <summary>Classifies files by extension and looks file paths up in the Qdrant collection.</summary>
+    internal class FileDiscrimination
+    {
+        // One client shared by every call, with a 30 second timeout.
+        private static readonly HttpClient httpClient = new HttpClient { Timeout = TimeSpan.FromSeconds(30) };
+        private static readonly string qdrantUrl = "http://localhost:6333/collections/personalInformation/points";
+
+        /// <summary>Maps the extension of <paramref name="filePath"/> to a <see cref="FileType"/>.</summary>
+        /// <param name="filePath">Path or file name; only the extension is inspected.</param>
+        /// <returns>Image, Document, or Other for every extension that is not listed.</returns>
+        public static FileType GetFileType(string filePath)
+        {
+            // Invariant casing: ToLower() follows the current culture (tr-TR turns ".GIF" into ".gıf").
+            string extension = Path.GetExtension(filePath).ToLowerInvariant();
+            return extension switch
+            {
+                ".jpeg" or ".jpg" or ".png" or ".gif" or ".heic" => FileType.Image,
+                ".txt" or ".pdf" or ".doc" or ".docx" or ".xls" or ".xlsx" or ".ppt" or ".pptx" => FileType.Document,
+                _ => FileType.Other,
+            };
+        }
+
+        /// <summary>Asks Qdrant (scroll, limit 1) whether a point with this filePath payload exists.</summary>
+        /// <param name="filePath">Value matched against the "filePath" payload key.</param>
+        /// <returns>True when the response contains at least one point.</returns>
+        /// <exception cref="Exception">Request or parsing failed; the cause is kept as InnerException.</exception>
+        public static async Task<bool> FileExistCheck(string filePath)
+        {
+            var requestData = new
+            {
+                filter = new { must = new[] { new { key = "filePath", match = new { value = filePath } } } },
+                limit = 1
+            };
+            var json = JsonConvert.SerializeObject(requestData);
+            try
+            {
+                using var httpContent = new StringContent(json, Encoding.UTF8, "application/json");
+                using var response = await httpClient.PostAsync($"{qdrantUrl}/scroll", httpContent);
+                response.EnsureSuccessStatusCode();
+                string responseString = await response.Content.ReadAsStringAsync();
+                var jsonObject = JsonNode.Parse(responseString);
+                return jsonObject?["result"]?["points"]?.AsArray()?.Count > 0;
+            }
+            catch (Exception ex)
+            {
+                LogWriter.WriteLog($"ファイル確認エラー: {ex.Message}", LogWriter.LogLevel.ERROR);
+                // Same exception type and message as before, now carrying the root cause.
+                throw new Exception("データベースにデータ取得エラー。", ex);
+            }
+        }
+    }
+}
diff --git a/FY2526-SW-PoC-APIRelay/FileType.cs b/FY2526-SW-PoC-APIRelay/FileType.cs
new file mode 100644
index 0000000..59b33b7
--- /dev/null
+++ b/FY2526-SW-PoC-APIRelay/FileType.cs
@@ -0,0 +1,12 @@
+namespace FY2526_SW_PoC_APIRelay
+{
+    enum FileType
+    {
+        /// <summary>Image file.</summary>
+        Image,
+        /// <summary>Document file.</summary>
+        Document,
+        /// <summary>Anything else.</summary>
+        Other
+    }
+}
\ No newline at end of file
diff --git 
a/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs b/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs new file mode 100644 index 0000000..12c1247 --- /dev/null +++ b/FY2526-SW-PoC-APIRelay/GetDataFromFile.cs @@ -0,0 +1,1150 @@ +using ImageMagick; +using InfoExtraction; +using Microsoft.Office.Interop.Excel; +using Microsoft.Office.Interop.Excel; +using Microsoft.Office.Interop.Excel; +using Microsoft.Office.Interop.PowerPoint; +using Microsoft.Office.Interop.PowerPoint; +using Microsoft.Office.Interop.PowerPoint; +using Microsoft.Office.Interop.Word; +using Microsoft.Office.Interop.Word; +using Microsoft.Office.Interop.Word; +using PdfiumViewer; +using PdfiumViewer; +using PretreatmentFile; +using System; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.InteropServices; +using System.Runtime.Versioning; +using System.Text; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading.Tasks; +using Windows.ApplicationModel.Core; +using Windows.Graphics.Imaging; +using Windows.Media.Ocr; +using Windows.Storage; +using ExcelApplication = Microsoft.Office.Interop.Excel.Application; +using PowerPointApplication = Microsoft.Office.Interop.PowerPoint.Application; +using WordApplication = Microsoft.Office.Interop.Word.Application; + +namespace FY2526_SW_PoC_APIRelay +{ + internal class GetDataFromFile + {
+        /// <summary>Runs the Japanese OCR engine over an image file and returns the recognized text.</summary>
+        /// <param name="filePath">Image path; a .heic file is first converted to a JPEG that is deleted afterwards.</param>
+        /// <returns>Recognized lines joined with <see cref="Environment.NewLine"/>; null when any step fails (logged).</returns>
+        public static async Task<string?> Ocr(string filePath)
+        {
+            // Declared outside the try block so that finally can remove the JPEG on every exit path.
+            string? convertedFilePath = null;
+            try
+            {
+                string extension = Path.GetExtension(filePath).ToLowerInvariant();
+                if (extension == ".heic")
+                {
+                    try
+                    {
+                        convertedFilePath = ConvertHeicToJpeg(filePath);
+                        if (convertedFilePath == null)
+                        {
+                            LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR);
+                            return null;
+                        }
+                        filePath = convertedFilePath;
+                    }
+                    catch (Exception conversionException)
+                    {
+                        LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR);
+                        return null;
+                    }
+                }
+
+                // The using declarations close the stream and the bitmap when the try block ends,
+                // i.e. before finally deletes the file they were read from.
+                var file = await StorageFile.GetFileFromPathAsync(filePath);
+                using var stream = await file.OpenAsync(FileAccessMode.Read);
+                var decoder = await BitmapDecoder.CreateAsync(stream);
+                using var bmp = await decoder.GetSoftwareBitmapAsync();
+
+                try
+                {
+                    var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja"));
+                    if (engine == null)
+                    {
+                        LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR);
+                        return null;
+                    }
+
+                    var result = await engine.RecognizeAsync(bmp);
+
+                    // One entry per OcrResult line, joined with line breaks.
+                    return string.Join(Environment.NewLine, result.Lines.Select(line => line.Text));
+                }
+                catch (Exception ocrException)
+                {
+                    LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR);
+                    return null;
+                }
+            }
+            catch (Exception e)
+            {
+                LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
+                return null;
+            }
+            finally
+            {
+                // Previously the JPEG was removed only after a successful recognition (and while
+                // the read stream was still open), so failed runs left the converted copy behind.
+                if (convertedFilePath != null && File.Exists(convertedFilePath))
+                {
+                    try
+                    {
+                        File.Delete(convertedFilePath);
+                    }
+                    catch (Exception deleteException)
+                    {
+                        LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
+                    }
+                }
+            }
+        }
+
+ /// heicをjpegに変換する + /// heicファイルのパス + /// 変換後のjpegファイルのパス + /// + private static string? 
ConvertHeicToJpeg(string heicFilePath)
+        {
+            try
+            {
+                // Write to a uniquely named file in the temp directory. Building the name with
+                // Replace(".heic", ".jpg") rewrote every ".heic" in the path (folder names too) and
+                // overwrote any JPEG already stored beside the source under the same base name.
+                string outputFilePath = Path.Combine(Path.GetTempPath(), $"{Guid.NewGuid():N}.jpg");
+                using (var image = new MagickImage(heicFilePath))
+                {
+                    image.Format = MagickFormat.Jpeg;
+                    image.Write(outputFilePath);
+                }
+
+                return outputFilePath;
+            }
+            catch (Exception e)
+            {
+                LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
+                return null; // null tells the caller that the conversion failed
+            }
+        }
+
+ public static string? ReadDocument(string filePath) + { + try + { + Encoding.RegisterProvider(CodePagesEncodingProvider.Instance); + string extension = Path.GetExtension(filePath).ToLower(); + string extractedText = string.Empty; + string? result = null; + + switch (Path.GetExtension(filePath.ToLower())) + { + case ".txt": + byte[] firstData = new byte[1000]; + //ゼロサイズのutf-16LE.BE 対応 + firstData[2] = 0xFF; + firstData[3] = 0xFF; + int readCount; + using (FileStream fs = new(filePath, FileMode.Open)) + { + readCount = fs.Read(firstData, 0, 1000); + } + + using (StreamReader sr = new(filePath, GetEncoding(firstData, readCount))) + { + result = sr.ReadToEnd(); + } + break; + case ".pdf": + using (PdfDocument doc = PdfDocument.Load(filePath)) + { + for (var pageNum = 0; pageNum < doc.PageCount; pageNum++) + { + result += doc.GetPdfText(pageNum); + } + } + break; + case ".docx": + case ".doc": + using (var word = new ComWrapper(new WordApplication() { Visible = false, DisplayAlerts = WdAlertLevel.wdAlertsNone })) + using (var docs = new ComWrapper(word.ComObject.Documents)) + { + using var doc = new ComWrapper(docs.ComObject.Open(filePath, + ReadOnly: true, + AddToRecentFiles: false, + Visible: false)); + var tempFiles = new string[2]; + + bool success = false; + try + { + tempFiles[0] = Path.GetTempFileName(); + tempFiles[1] = Path.GetTempFileName(); + + // Wordドキュメントの全テキストを取得 + string fullText = doc.ComObject.Content.Text; + + // テキストを一時ファイルに保存 + 
File.WriteAllText(tempFiles[0], fullText, Encoding.UTF8); + + // テキストボックス内のテキストやコメントも抽出 + List otherContents = new List(); + + foreach (Microsoft.Office.Interop.Word.Shape shape in doc.ComObject.Shapes) + { + // 図形のテキスト抽出 + ExtractShapeContents(otherContents, shape); + } + foreach (Microsoft.Office.Interop.Word.Comment comment in doc.ComObject.Comments) + { + // コメントの保存 + otherContents.Add(comment.Author + ":" + comment.Range.Text); + } + + // 他のテキストを一時ファイルに保存 + File.WriteAllLines(tempFiles[1], otherContents, Encoding.GetEncoding("UTF-8")); + + success = true; + } + catch + { + throw; + } + finally + { + doc.ComObject.Close(false); + if (success) + { + // マージ + result = MergeTextContents(tempFiles); + } + DeleteFiles(tempFiles); + } + } + break; + case ".xlsx": + case ".xls": + using (var excel = new ComWrapper(new ExcelApplication() { Visible = false, DisplayAlerts = false })) + using (var books = new ComWrapper(excel.ComObject.Workbooks)) + { + using var book = new ComWrapper(books.ComObject.Open(filePath, + UpdateLinks: XlUpdateLinks.xlUpdateLinksNever, + ReadOnly: true, + IgnoreReadOnlyRecommended: true, + Editable: false)); + + List contents = new(); + List tempFiles = new(); + + bool success = false; + try + { + for (int i = 1; i <= book.ComObject.Worksheets.Count; i++) + { + using var sheet = new ComWrapper(book.ComObject.Worksheets[i]); + + var tempFile1 = Path.GetTempFileName(); + tempFiles.Add(tempFile1); + + // シート内のテキストを抽出 + List sheetContents = new(); + foreach (Microsoft.Office.Interop.Excel.Range cell in sheet.ComObject.UsedRange) + { + // セルの内容を追加 + sheetContents.Add(cell.Text.ToString()); + } + + // シート内のテキストを一時ファイルに保存 + File.WriteAllLines(tempFile1, sheetContents, Encoding.GetEncoding("UTF-8")); + + // 図形内のテキスト抽出 + List otherContents = new(); + foreach (Microsoft.Office.Interop.Excel.Shape shape in sheet.ComObject.Shapes) + { + // 図形のテキスト抽出 + ExtractShapesContents(otherContents, shape); + } + + // コメントの抽出 + foreach 
(Microsoft.Office.Interop.Excel.CommentThreaded comment in sheet.ComObject.CommentsThreaded) + { + // コメントのテキストを追加 + otherContents.Add(comment.Author.Name + ":" + comment.Text()); + } + + // メモの抽出 + foreach (Microsoft.Office.Interop.Excel.Comment memo in sheet.ComObject.Comments) + { + // メモのテキストを追加 + otherContents.Add(memo.Author + ":" + memo.Text()); + } + + var tempFile2 = Path.GetTempFileName(); + tempFiles.Add(tempFile2); + File.WriteAllLines(tempFile2, otherContents, Encoding.GetEncoding("UTF-8")); + } + + success = true; + } + catch + { + throw; + } + finally + { + book.ComObject.Close(false); + if (success) + { + // 一時ファイルから内容をマージ + foreach (var tempFile in tempFiles) + { + var sheetContents = MergeTextContents(new string[] { tempFile }); + contents.Add(sheetContents); + File.Delete(tempFile); + } + + // 結果として、全ての内容を結合 + result = string.Join(string.Empty, contents); + } + + // 一時ファイルの削除 + DeleteFiles(tempFiles.ToArray()); + } + } + break; + case ".pptx": + case ".ppt": + using (var powerPoint = new ComWrapper(new PowerPointApplication() { DisplayAlerts = PpAlertLevel.ppAlertsNone })) + using (var ppts = new ComWrapper(powerPoint.ComObject.Presentations)) + { + using var ppt = new ComWrapper(ppts.ComObject.Open(filePath, + ReadOnly: Microsoft.Office.Core.MsoTriState.msoTrue, + WithWindow: Microsoft.Office.Core.MsoTriState.msoFalse)); + + var contents = new List(); + var tempFiles = new string[1]; + var success = false; + try + { + tempFiles[0] = Path.GetTempFileName(); + + // 図形のテキストとコメントの抽出 + var slideContents = new List(); + foreach (Slide slide in ppt.ComObject.Slides) + { + foreach (Microsoft.Office.Interop.PowerPoint.Shape shape in slide.Shapes) + { + ExtractShapeContents(slideContents, shape); + } + foreach (Microsoft.Office.Interop.PowerPoint.Comment comment in slide.Comments) + { + slideContents.Add(comment.Author + ":" + comment.Text); + } + slideContents.Add(slide.NotesPage.Shapes.Placeholders[2].TextFrame.TextRange.Text);//placefolders[1] is slide 
itself. + } + File.WriteAllLines(tempFiles[0], slideContents, Encoding.GetEncoding("UTF-8")); + success = true; + } + finally + { + ppt.ComObject.Close(); + if (success) + { + result = MergeTextContents(tempFiles); + } + DeleteFiles(tempFiles); + } + break; + } + default: + throw new ArgumentException("ドキュメントファイルではありません", nameof(filePath)); + } + return result; + } + catch (Exception ex) + { + System.Console.Write(ex.Message); + System.Console.Write(ex.StackTrace); + LogWriter.WriteLog($"読み出しでエラー発生: {ex.Message}", LogWriter.LogLevel.ERROR); + throw; + } + } + + private static void ExtractShapeContents(List contents, Microsoft.Office.Interop.Word.Shape shape) + { + shape.Select(); + if (shape.Type == Microsoft.Office.Core.MsoShapeType.msoGroup) + { + foreach (Microsoft.Office.Interop.Word.Shape subShape in shape.GroupItems) + { + // グループ内図形に対して再帰呼び出し + ExtractShapeContents(contents, subShape); + } + } + else if (shape.Type == Microsoft.Office.Core.MsoShapeType.msoCanvas) + { + foreach (Microsoft.Office.Interop.Word.Shape subShape in shape.CanvasItems) + { + // キャンバス内図形に対して再帰呼び出し + ExtractShapeContents(contents, subShape); + } + } + else + { + if (shape.TextFrame != null && shape.TextFrame.HasText != 0) + { + // 図形内テキストの保存 + var text = shape.TextFrame?.TextRange?.Text; + if (!string.IsNullOrEmpty(text)) + { + contents.Add(text); + } + } + } + } + + private static void ExtractShapesContents(List contents, Microsoft.Office.Interop.Excel.Shape shape) + { + if (shape.Type == Microsoft.Office.Core.MsoShapeType.msoGroup) + { + foreach (Microsoft.Office.Interop.Excel.Shape subShape in shape.GroupItems) + { + ExtractShapesContents(contents, subShape); + } + } + else if (shape.Type == Microsoft.Office.Core.MsoShapeType.msoCanvas) + { + foreach (Microsoft.Office.Interop.Excel.Shape subShape in shape.CanvasItems) + { + ExtractShapesContents(contents, subShape); + } + } + else + { + try + { + var text = shape.TextEffect?.Text; + if (!string.IsNullOrEmpty(text)) + { + 
contents.Add(text); + } + } + catch + { + } + } + } + + private static void ExtractShapeContents(List contents, Microsoft.Office.Interop.PowerPoint.Shape shape) + { + if (shape.Type == Microsoft.Office.Core.MsoShapeType.msoGroup) + { + foreach (Microsoft.Office.Interop.PowerPoint.Shape subShape in shape.GroupItems) + { + ExtractShapeContents(contents, subShape); + } + } + else if (shape.Type == Microsoft.Office.Core.MsoShapeType.msoCanvas) + { + foreach (Microsoft.Office.Interop.PowerPoint.Shape subShape in shape.CanvasItems) + { + ExtractShapeContents(contents, subShape); + } + } + else + { + if (shape.TextFrame != null && shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue) + { + var text = shape.TextFrame?.TextRange?.Text; + if (!string.IsNullOrEmpty(text)) + { + contents.Add(text); + } + } + } + } + + private static string MergeTextContents(string[] tempFiles) + { + var contents = new List(); + foreach (var tempFile in tempFiles) + { + if (File.Exists(tempFile)) + { + contents.AddRange(File.ReadAllLines(tempFile, Encoding.GetEncoding("Shift_JIS"))); + } + } + return string.Join("", contents); + } + + private static void DeleteFiles(string[] tempFiles) + { + foreach (var tempFile in tempFiles) + { + if (File.Exists(tempFile)) + { + File.Delete(tempFile); + } + } + } + + private static Encoding GetEncoding(byte[] firstData, int dataLength) + { + if (dataLength < 2) + { + return Encoding.GetEncoding("Shift_JIS"); + } + + if ((firstData[0] == 0xfe) && (firstData[1] == 0xff)) + { + // UTF-16 BE + return new UnicodeEncoding(true, true); + } + + if ((firstData[0] == 0xff) && (firstData[1] == 0xfe)) + { + if ((4 <= firstData.Length) && + (firstData[2] == 0x00) && (firstData[3] == 0x00)) + { + // UTF-32 LE + return new UTF32Encoding(false, true); + } + // UTF-16 LE + return new UnicodeEncoding(false, true); + } + + if (dataLength < 3) + { + return Encoding.GetEncoding("Shift_JIS"); + } + + if ((firstData[0] == 0xef) && (firstData[1] == 0xbb) && 
(firstData[2] == 0xbf)) + { + //UTF-8 + return new UTF8Encoding(true, true); + } + + if (dataLength < 4) + { + return Encoding.GetEncoding("Shift_JIS"); + } + + if ((firstData[0] == 0x00) && (firstData[1] == 0x00) && + (firstData[2] == 0xfe) && (firstData[3] == 0xff)) + { + // UTF-32 BE + return new UTF32Encoding(true, true); + } + + // BOMなし + var ret = JISEncodingJudgment(firstData, dataLength); + if (ret.Item1 == false) + { + if (ret.Item2) + { + return Encoding.GetEncoding("iso-2022-jp"); + } + else + { + return Encoding.GetEncoding("us-ascii"); + } + } + + var outOfSpecification = Utf8EncodingJudgment(firstData, dataLength); + if (outOfSpecification == false) + { + //UTF-8 + return new UTF8Encoding(true, true); + } + + outOfSpecification = EUCJPEncodingJudgment(firstData, dataLength); + if (outOfSpecification == false) + { + return Encoding.GetEncoding("EUC-JP"); + } + + return Encoding.GetEncoding("Shift_JIS"); + } + + private static bool IsMatched(byte[] data, byte[] bom) + { + bool result = true; + + for (int i = 0; i < bom.Length; i++) + { + if (bom[i] != data[i]) + { + result = false; + break; + } + } + + return result; + } + + private static (bool, bool) JISEncodingJudgment(byte[] buffer, int sizeOfBuffer) + { + bool result = false; + bool esc1 = false; + bool esc2 = false; + byte[] byteESC1 = { 0x1B, 0x28, 0x42 }; + byte[] byteESC2 = { 0x1B, 0x24, 0x42 }; + byte[] backESC = { 0, 0, 0 }; + + for (int i = 0; i < sizeOfBuffer; i++) + { + if (0x80 <= buffer[i]) + { + result = true; + break; + } + else + { + backESC[0] = backESC[1]; + backESC[1] = backESC[2]; + backESC[2] = buffer[i]; + if (esc1 == false && IsMatched(backESC, byteESC1)) + { + esc1 = true; + } + if (esc2 == false && IsMatched(backESC, byteESC2)) + { + esc2 = true; + } + } + } + + return (result, esc1 || esc2); + } + + private static bool Utf8EncodingJudgment(byte[] buffer, int sizeOfBuffer) + { + bool outOfSpecification; + + outOfSpecification = false; + uint[] byteChar = new uint[6]; + int 
byteCharCount = 0; + + for (int i = 0; i < sizeOfBuffer; i++) + { + //2バイト文字以上である + if (0x80 <= buffer[i]) + { + //2バイト文字 + uint char2byte = (uint)0b11100000 & buffer[i]; + if (char2byte == 0b11000000) + { + //セカンドコード数が規格より少なければ規格外 + outOfSpecification = Utf8OutOfSpecification(byteChar[0], byteCharCount, false); + if (outOfSpecification) + { + break; + } + + byteChar[0] = char2byte; + byteCharCount = 1; + continue; + } + + //3バイト文字 + uint char3byte = (uint)0b11110000 & (uint)buffer[i]; + if (char3byte == 0b11100000) + { + //セカンドコード数が規格より少なければ規格外 + outOfSpecification = Utf8OutOfSpecification(byteChar[0], byteCharCount, false); + if (outOfSpecification) + { + break; + } + + byteChar[0] = char3byte; + byteCharCount = 1; + continue; + } + + //4バイト文字 + uint char4byte = (uint)0b11111000 & (uint)buffer[i]; + if (char4byte == 0b11110000) + { + //セカンドコード数が規格より少なければ規格外 + outOfSpecification = Utf8OutOfSpecification(byteChar[0], byteCharCount, false); + if (outOfSpecification) + { + break; + } + + byteChar[0] = char4byte; + byteCharCount = 1; + continue; + } + + //2バイト目以降のコード + uint charSecond = (uint)0b11000000 & (uint)buffer[i]; + if (charSecond == 0b10000000) + { + // 文字の先頭がセカンドコードなら規格外 + if (byteCharCount < 1) + { + outOfSpecification = true; + break; + } + + //セカンドコードを保存 + byteChar[byteCharCount] = charSecond; + byteCharCount++; + + //セカンドコード数が規格より多ければ規格外 + outOfSpecification = Utf8OutOfSpecification(byteChar[0], byteCharCount, true); + if (outOfSpecification) + { + break; + } + + continue; + } + + //どれにも当てはまらない + outOfSpecification = true; + break; + } + else + { + // 7bit文字 + byteChar[0] = 0; + byteCharCount = 0; + } + } + + return outOfSpecification; + } + + private static bool Utf8OutOfSpecification(uint topByteChar, int byteCharCount, bool checkBig) + { + bool outOfSpecification = false; + + //セカンドコード数が規格より多ければ規格外 + if (topByteChar == 0b11000000) + { + if (checkBig == true) + { + if (byteCharCount > 2) outOfSpecification = true; + } + else + { + if (byteCharCount < 
2) outOfSpecification = true; + } + } + else if (topByteChar == 0b11100000) + { + if (checkBig == true) + { + if (byteCharCount > 3) outOfSpecification = true; + } + else + { + if (byteCharCount < 3) outOfSpecification = true; + } + } + else if (topByteChar == 0b11110000) + { + if (checkBig == true) + { + if (byteCharCount > 4) outOfSpecification = true; + } + else + { + if (byteCharCount < 4) outOfSpecification = true; + } + } + + return outOfSpecification; + } + + private enum BYTECODE : byte { OneByteCode, TwoByteCode, KanaOneByte } + + private static bool EUCJPEncodingJudgment(byte[] buffer, int sizeOfBuffer) + { + bool outOfSpecification = false; + + BYTECODE beforeCode = BYTECODE.OneByteCode; + int byteCharCount = 0; + + for (int i = 0; i < sizeOfBuffer; i++) + { + // 2バイトコード + if (0xA1 <= buffer[i] && buffer[i] <= 0xFE) + { + if (beforeCode == BYTECODE.KanaOneByte) + { + if (byteCharCount == 1) + { + byteCharCount = 2; + } + else + { + outOfSpecification = true; + break; + } + } + + if (beforeCode == BYTECODE.TwoByteCode) + { + if (byteCharCount == 1) + byteCharCount = 2; + else if (byteCharCount == 2) + byteCharCount = 1; + } + + beforeCode = BYTECODE.TwoByteCode; + } + // 1バイトコード + else if (buffer[i] <= 0x7F) + { + if (beforeCode == BYTECODE.TwoByteCode && byteCharCount == 1) + { + outOfSpecification = true; + break; + } + + beforeCode = BYTECODE.OneByteCode; + byteCharCount = 1; + } + // 半角カタカナ2バイトコード + else if (buffer[i] == 0x8E && byteCharCount == 1) + { + beforeCode = BYTECODE.KanaOneByte; + byteCharCount = 1; + } + // あり得ない + else + { + outOfSpecification = true; + break; + } + } + + return outOfSpecification; + } + + /// クレンジング + /// 文字列 + /// csvファイルのパス + /// 特殊記号などを除去した文字列 + /// + public static string? CleansingText(string fileText, string csvFilePath) + { + string? 
result = null; + + try + { + // CSVファイルから除外する特殊記号を読み込む + System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance); + List specialCharacters = LoadSpecialCharacters(csvFilePath); + + if (specialCharacters.Count > 0) + { + // 特殊記号を1つずつ削除 + foreach (var specialChar in specialCharacters) + { + if (!string.IsNullOrEmpty(specialChar)) + { + fileText = fileText.Replace(specialChar, string.Empty); + } + } + // 文字の統一処理(数字の半角化、アルファベット小文字化、ひらがなカタカナ漢字の全角化) + fileText = NormalizeText(fileText); + result = fileText; // 結果を返す + } + else + { + result = fileText; // 特殊記号が無ければそのまま返す + } + } + catch (Exception ex) + { + LogWriter.WriteLog($"クレンジング処理中にエラーが発生しました: {ex.Message}", LogWriter.LogLevel.ERROR); + Console.WriteLine($"エラーが発生しました: {ex.Message}"); + result = fileText; // エラーが発生した場合は元の文字列を返す + } + + return result; + } + + /// + /// CSVファイルから特殊記号のリストを読み込む + /// + /// CSVファイルのパス + /// 特殊記号のリスト + /// + private static List LoadSpecialCharacters(string csvFilePath) + { + List specialCharacters = new List(); + + try + { + // Shift-JISエンコーディングでファイルを開く + using (var reader = new StreamReader(csvFilePath, System.Text.Encoding.GetEncoding("shift_jis"))) + { + // ファイルが空でないことを確認 + if (!reader.EndOfStream) + { + var line = reader.ReadLine(); // 1行のみ読み込む + + // lineがnullでないことを確認 + if (line != null) + { + var trimmedLine = line.Trim(); + + if (!string.IsNullOrEmpty(trimmedLine)) + { + // 行をカンマで分割して、列を取り出す + var columns = trimmedLine.Split(','); + + // 列ごとにリストに追加 + foreach (var column in columns) + { + specialCharacters.Add(column.Trim()); + } + } + } + } + } + } + catch (Exception ex) + { + LogWriter.WriteLog($"CSVファイルの読み込みでエラーが発生しました: {ex.Message}", LogWriter.LogLevel.ERROR); + Console.WriteLine($"CSVファイルの読み込みエラー: {ex.Message}"); + } + + return specialCharacters; + } + + /// + /// 文字列の正規化(数字半角化、アルファベット小文字化、ひらがなカタカナ漢字の全角化) + /// + /// 入力文字列 + /// 正規化された文字列 + private static string NormalizeText(string inputText) + { + // 1. 
全角化 + string result = NormalizeKatakana(inputText); + + // 2. ローマ数字をアラビア数字に変換 + result = NormalizeRomanNumerals(result); + + // 3. アルファベットと数字を半角化 + result = NormalizeAlphabetAndNumbers(result); + + return result; + } + + /// + /// ひらがな、カタカナ、漢字を全角に変換する + /// + /// 入力文字列 + /// 全角に変換された文字列 + private static string NormalizeKatakana(string inputText) + { + // 半角カタカナを全角カタカナに変換 + string result = inputText; + result = result.Replace("ア", "ア") + .Replace("イ", "イ") + .Replace("ウ", "ウ") + .Replace("エ", "エ") + .Replace("オ", "オ") + .Replace("ァ", "ァ") + .Replace("ィ", "ィ") + .Replace("ゥ", "ゥ") + .Replace("ェ", "ェ") + .Replace("ォ", "ォ") + .Replace("カ", "カ") + .Replace("キ", "キ") + .Replace("ク", "ク") + .Replace("ケ", "ケ") + .Replace("コ", "コ") + .Replace("サ", "サ") + .Replace("シ", "シ") + .Replace("ス", "ス") + .Replace("セ", "セ") + .Replace("ソ", "ソ") + .Replace("タ", "タ") + .Replace("チ", "チ") + .Replace("ツ", "ツ") + .Replace("ッ", "ッ") + .Replace("テ", "テ") + .Replace("ト", "ト") + .Replace("ナ", "ナ") + .Replace("ニ", "ニ") + .Replace("ヌ", "ヌ") + .Replace("ネ", "ネ") + .Replace("ノ", "ノ") + .Replace("ハ", "ハ") + .Replace("ヒ", "ヒ") + .Replace("フ", "フ") + .Replace("ヘ", "ヘ") + .Replace("ホ", "ホ") + .Replace("マ", "マ") + .Replace("ミ", "ミ") + .Replace("ム", "ム") + .Replace("メ", "メ") + .Replace("モ", "モ") + .Replace("ヤ", "ヤ") + .Replace("ユ", "ユ") + .Replace("ヨ", "ヨ") + .Replace("ャ", "ャ") + .Replace("ュ", "ュ") + .Replace("ョ", "ョ") + .Replace("ラ", "ラ") + .Replace("リ", "リ") + .Replace("ル", "ル") + .Replace("レ", "レ") + .Replace("ロ", "ロ") + .Replace("ワ", "ワ") + .Replace("ヲ", "ヲ") + .Replace("ン", "ン") + .Replace("ー", "ー"); + return result; + } + + /// + /// ローマ数字をアラビア数字に変換 + /// + /// 入力文字列 + /// ローマ数字がアラビア数字に変換された文字列 + private static string NormalizeRomanNumerals(string inputText) + { + // ローマ数字のパターンを正規表現でマッチさせる + string pattern = @"[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫⅬⅭⅮⅯⅰⅱⅲⅳⅴⅵⅶⅷⅸⅹⅺⅻⅼⅽⅾ]+"; + + // 正規表現でローマ数字部分を検索し、それらをアラビア数字に変換 + return Regex.Replace(inputText, pattern, match => ConvertRomanToArabic(match.Value)); + } + + // ローマ数字をアラビア数字に変換 
+        private static string ConvertRomanToArabic(string roman)
+        {
+            // Value of each Unicode Roman-numeral glyph this method understands.
+            Dictionary<char, int> romanToArabicMap = new Dictionary<char, int>
+            {
+                { 'Ⅰ', 1 }, { 'Ⅱ', 2 }, { 'Ⅲ', 3 }, { 'Ⅳ', 4 }, { 'Ⅴ', 5 },
+                { 'Ⅵ', 6 }, { 'Ⅶ', 7 }, { 'Ⅷ', 8 }, { 'Ⅸ', 9 }, { 'Ⅹ', 10 },
+                { 'Ⅺ', 11 }, { 'Ⅻ', 12 }, { 'Ⅼ', 50 }, { 'Ⅽ', 100 }, { 'Ⅾ', 500 }, { 'Ⅿ', 1000 },
+                { 'ⅰ', 1 }, { 'ⅱ', 2 }, { 'ⅲ', 3 }, { 'ⅳ', 4 }, { 'ⅴ', 5 },
+                { 'ⅵ', 6 }, { 'ⅶ', 7 }, { 'ⅷ', 8 }, { 'ⅸ', 9 }, { 'ⅹ', 10 },
+                { 'ⅺ', 11 }, { 'ⅻ', 12 }, { 'ⅼ', 50 }, { 'ⅽ', 100 }, { 'ⅾ', 500 }
+            };
+
+            int total = 0;
+            int previousValue = 0;
+
+            // Walk right to left so each glyph can be compared with its right-hand neighbour.
+            for (int i = roman.Length - 1; i >= 0; i--)
+            {
+                char currentChar = roman[i];
+
+                // Single dictionary lookup; the earlier ContainsKey + indexer pair
+                // looked the same key up twice.
+                if (romanToArabicMap.TryGetValue(currentChar, out int currentValue))
+                {
+                    // A value smaller than the glyph to its right is subtracted,
+                    // anything else is added.
+                    if (currentValue < previousValue)
+                    {
+                        total -= currentValue;
+                    }
+                    else
+                    {
+                        total += currentValue;
+                    }
+
+                    // Carried into the comparison for the next glyph to the left.
+                    previousValue = currentValue;
+                }
+                else
+                {
+                    // Not a Roman-numeral glyph: skip it.
+                    continue;
+                }
+            }
+
+            return total.ToString();
+        }
+
+        /// <summary>
+        /// Lower-cases the text, then converts full-width Latin letters and digits to half-width.
+        /// </summary>
+        /// <param name="inputText">Text to normalize.</param>
+        /// <returns>The normalized text.</returns>
+        private static string NormalizeAlphabetAndNumbers(string inputText)
+        {
+            // Invariant casing: ToLower() follows the current culture (tr-TR maps 'I' to dotless 'ı').
+            string result = inputText.ToLowerInvariant();
+            result = ToHalfWidth(result);
+
+            // Second digit pass kept from the original flow; ToHalfWidth has already handled digits.
+            result = NormalizeNumbers(result);
+
+            return result;
+        }
+
+        /// <summary>
+        /// Converts full-width digits (U+FF10..U+FF19) to ASCII digits.
+        /// </summary>
+        /// <param name="inputText">Text to convert.</param>
+        /// <returns>The text with every full-width digit replaced by its ASCII digit.</returns>
+        private static string NormalizeNumbers(string inputText)
+        {
+            return inputText.Replace("\uFF10", "0")
+                            .Replace("\uFF11", "1")
+                            .Replace("\uFF12", "2")
+                            .Replace("\uFF13", "3")
+                            .Replace("\uFF14", "4")
+                            .Replace("\uFF15", "5")
+                            .Replace("\uFF16", "6")
+                            .Replace("\uFF17", "7")
+                            .Replace("\uFF18", "8")
+                            .Replace("\uFF19", "9");
+        }
+
+        /// <summary>
+        /// Converts full-width Latin letters and digits to their ASCII counterparts.
+        /// </summary>
+        /// <param name="inputText">Text to convert.</param>
+        /// <returns>The converted text; every other character is left unchanged.</returns>
+        private static string ToHalfWidth(string inputText)
+        {
+            char[] chars = inputText.ToCharArray();
+            for (int i = 0; i < chars.Length; i++)
+            {
+                // Ranges are written as escapes so they cannot collapse into ASCII-to-ASCII no-ops.
+                if (chars[i] >= '\uFF21' && chars[i] <= '\uFF3A')
+                {
+                    chars[i] = (char)(chars[i] - '\uFF21' + 'A'); // full-width upper-case letter
+                }
+                else if (chars[i] >= '\uFF41' && chars[i] <= '\uFF5A')
+                {
+                    chars[i] = (char)(chars[i] - '\uFF41' + 'a'); // full-width lower-case letter
+                }
+                // Full-width digits.
+                else if (chars[i] >= '\uFF10' && chars[i] <= '\uFF19')
+                {
+                    chars[i] = (char)(chars[i] - '\uFF10' + '0');
+                }
+            }
+            return new string(chars);
+        }
+    }
+}
diff --git a/FY2526-SW-PoC-APIRelay/OcrHelper.cs b/FY2526-SW-PoC-APIRelay/OcrHelper.cs new file mode 100644 index 0000000..59d9e56 --- /dev/null +++ b/FY2526-SW-PoC-APIRelay/OcrHelper.cs @@ -0,0 +1,123 @@ +using FY2526_SW_PoC_APIRelay; +using ImageMagick; +using PretreatmentFile; +using System; +using System.IO; +using System.Threading.Tasks; +using Windows.Graphics.Imaging; +using Windows.Media.Ocr; +using Windows.Storage; +using Windows.Storage.Streams; + +namespace PretreatmentFile +{ + public class OcrHelper + { + /// OCR + /// ファイルパス + /// 読み取った文字列 + /// + public static async Task Ocr(string filePath) + { + try + { + string extension = Path.GetExtension(filePath).ToLower(); + string? 
convertedFilePath = null; + + if (extension == ".heic") + { + try + { + convertedFilePath = ConvertHeicToJpeg(filePath); + if (convertedFilePath == null) + { + LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR); + return null; + } + filePath = convertedFilePath; + } + catch (Exception conversionException) + { + LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR); + return null; + } + } + + // ファイルを開き、BitmapDecoderを作成 + var file = await StorageFile.GetFileFromPathAsync(filePath); + var stream = await file.OpenAsync(FileAccessMode.Read); + var decoder = await BitmapDecoder.CreateAsync(stream); + var bmp = await decoder.GetSoftwareBitmapAsync(); + + try + { + var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja")); + if (engine != null) + { + var result = await engine.RecognizeAsync(bmp); + + // OcrResult.Lines から各行のテキストを取得し、改行で結合 + var extractedText = string.Join(Environment.NewLine, result.Lines.Select(line => line.Text)); + + if (convertedFilePath != null && File.Exists(convertedFilePath)) + { + try + { + File.Delete(convertedFilePath); // JPEGファイルを削除 + } + catch (Exception deleteException) + { + LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR); + } + } + + return extractedText; + } + else + { + LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR); + return null; + } + } + catch (Exception ocrException) + { + LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR); + return null; + } + } + catch (Exception e) + { + LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR); + return null; + } + } + + + /// heicをjpegに変換する + /// heicファイルのパス + /// 変換後のjpegファイルのパス + /// + private static string? 
ConvertHeicToJpeg(string heicFilePath) + { + try + { + // Magick.NETを使用してHEICファイルをJPEGに変換 + string outputFilePath = heicFilePath.Replace(".heic", ".jpg", StringComparison.OrdinalIgnoreCase); + + using (var image = new MagickImage(heicFilePath)) + { + // JPEGとして保存 + image.Format = MagickFormat.Jpeg; + image.Write(outputFilePath); + } + + return outputFilePath; + } + catch (Exception e) + { + LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR); + return null; // 変換に失敗した場合はnullを返す + } + } + } +} \ No newline at end of file diff --git a/FY2526-SW-PoC-APIRelay/Program.cs b/FY2526-SW-PoC-APIRelay/Program.cs index e8efa18..b56dee2 100644 --- a/FY2526-SW-PoC-APIRelay/Program.cs +++ b/FY2526-SW-PoC-APIRelay/Program.cs @@ -1,5 +1,8 @@ -namespace FY2526_SW_PoC_APIRelay +using System.Reflection; + +namespace FY2526_SW_PoC_APIRelay { + internal class Program { private enum ResultCode : int @@ -11,20 +14,125 @@ Error = 5000, ErrorReadFile = 5001, ErrorCleansing = 5002, + FileAlreadyExists = 6001, + FileNotSupported = 6002 } /// メイン関数 /// 実行引数 [STAThread] - static void Main(string[] args) + static async Task Main(string[] args) { - LogWriter.WriteLog("PretreatmentFile 開始", LogWriter.LogLevel.INFO); + LogWriter.WriteLog("11ファイル前処理 開始", LogWriter.LogLevel.INFO); + AppDomain.CurrentDomain.ProcessExit += (s, e) => CleanupPythonProcesses(); + AppDomain.CurrentDomain.UnhandledException += (s, e) => CleanupPythonProcesses(); + Console.CancelKeyPress += (sender, e) => CleanupPythonProcesses(); - LogWriter.WriteDebugLog(string.Join(" , ", args)); + try + { + // 入力パラメータチェック + if (args.Length != 2) + { + System.Console.Write("{0} Usage: FY2526-SW-PoC-APIRelay.exe \"ファイルパス\" \"特殊記号が記載されたCSVのパス\"", (int)ResultCode.BadParameter); + return; + } - System.Console.Write("{0}", (int)ResultCode.Success); + string? filePath; + string? csvPath; + String? 
fileContent; - LogWriter.WriteLog("PretreatmentFile 終了", LogWriter.LogLevel.INFO); + filePath = args[0]; + csvPath = args[1]; + //System.Console.Write(string.Format("入力パラメータ ファイルパス:{0} CSVパス:{1}", filePath, csvPath), LogWriter.LogLevel.DEBUG); + var fileType = FileDiscrimination.GetFileType(filePath); + if (fileType == FileType.Other) + { + LogWriter.WriteLog($"対応していないファイル形式です: {filePath}", LogWriter.LogLevel.INFO); + Console.WriteLine($"{(int)ResultCode.FileNotSupported}"); + return; + } + + Task isFileExist = FileDiscrimination.FileExistCheck(filePath); + if (isFileExist.Result) + { + LogWriter.WriteLog($"ファイル既に存在する: {filePath}", LogWriter.LogLevel.INFO); + Console.WriteLine($"{(int)ResultCode.FileAlreadyExists}"); + return; + } + + switch (fileType) + { + case FileType.Image: + fileContent = await GetDataFromFile.Ocr(filePath); // 非同期メソッドを呼び出し + Console.WriteLine($"{fileContent}"); + // OCR + break; + case FileType.Document: + fileContent = GetDataFromFile.ReadDocument(filePath); + Console.WriteLine($"{fileContent}"); + // 読み出し + break; + default: + LogWriter.WriteLog($"対応していないファイル形式です: {filePath}", LogWriter.LogLevel.INFO); + Console.WriteLine($"{(int)ResultCode.FileNotSupported}"); + return; + } + + if (fileContent == null) + { + System.Console.Write("{0}", (int)ResultCode.ErrorReadFile); + LogWriter.WriteLog($"ファイルの読み込みに失敗しました。結果コード:{(int)ResultCode.ErrorReadFile}", LogWriter.LogLevel.ERROR); + return; + } + + // クレンジング + var cleansingText = GetDataFromFile.CleansingText(fileContent, csvPath); + if (cleansingText == null) + { + System.Console.Write("{0}", (int)ResultCode.ErrorCleansing); + LogWriter.WriteLog($"クレンジングに失敗しました。結果コード:{(int)ResultCode.ErrorCleansing}", LogWriter.LogLevel.ERROR); + return; + } + + + Console.Write($"{cleansingText}"); + } + catch (Exception ex) + { + LogWriter.WriteLog($"実行エラー : {ex.Message}", LogWriter.LogLevel.ERROR); + Console.WriteLine($"{(int)ResultCode.Error}"); + } + finally + { + CleanupPythonProcesses(); + } + } + + + /// 
Pythonプロセスクリア + private static void CleanupPythonProcesses() + { + //foreach (var process in FY2526_SW_PoC_APIRelay.SearchInVectorDB.pythonProcesses.ToArray()) + //{ + // try + // { + // if (!process.HasExited) + // { + // LogWriter.WriteLog($"プロセスクリアされました", LogWriter.LogLevel.INFO); + // process.Kill(); + // process.WaitForExit(1000); + // } + // } + // catch (Exception ex) + // { + // LogWriter.WriteLog($"プロセスクリアエラー発生: {ex.Message}", LogWriter.LogLevel.ERROR); + // } + // finally + // { + // process.Dispose(); + // } + //} + //VectorSearch.SearchInVectorDB.pythonProcesses.Clear(); } } } \ No newline at end of file