1133 lines
46 KiB
C#
1133 lines
46 KiB
C#
using ImageMagick;
|
||
using InfoExtraction;
|
||
using Microsoft.Office.Interop.Excel;
|
||
using Microsoft.Office.Interop.PowerPoint;
|
||
using Microsoft.Office.Interop.Word;
|
||
using PdfiumViewer;
|
||
using System.Text;
|
||
using System.Text.RegularExpressions;
|
||
using Windows.Graphics.Imaging;
|
||
using Windows.Media.Ocr;
|
||
using Windows.Storage;
|
||
using ExcelApplication = Microsoft.Office.Interop.Excel.Application;
|
||
using PowerPointApplication = Microsoft.Office.Interop.PowerPoint.Application;
|
||
using WordApplication = Microsoft.Office.Interop.Word.Application;
|
||
|
||
namespace FY2526_SW_PoC_APIRelay
|
||
{
|
||
internal class GetDataFromFile
|
||
{
|
||
/// <summary>Runs Windows OCR (Japanese) on an image file and returns the recognized text.</summary>
/// <param name="filePath">Path of the image file. A ".heic" file is first converted to a temporary JPEG.</param>
/// <returns>
/// The recognized lines joined with <see cref="Environment.NewLine"/>, or null when the HEIC conversion,
/// the OCR engine creation or the recognition itself fails (the failure is written to the log).
/// </returns>
public static async Task<string?> Ocr(string filePath)
{
    // Path of the intermediate JPEG, if one was produced; removed in the finally block on every exit path.
    string? convertedFilePath = null;

    try
    {
        // Invariant lower-casing: a culture-sensitive ToLower() turns "I" into a dotless i under
        // Turkish cultures, which would make ".HEIC" fail the comparison below.
        string extension = Path.GetExtension(filePath).ToLowerInvariant();

        if (extension == ".heic")
        {
            try
            {
                convertedFilePath = ConvertHeicToJpeg(filePath);
                if (convertedFilePath == null)
                {
                    LogWriter.WriteLog("HEICファイルの変換に失敗しました", LogWriter.LogLevel.ERROR);
                    return null;
                }
                filePath = convertedFilePath;
            }
            catch (Exception conversionException)
            {
                LogWriter.WriteLog($"HEICからJPEGへの変換中にエラーが発生しました: {conversionException.Message}", LogWriter.LogLevel.ERROR);
                return null;
            }
        }

        // Open the file and decode it into a bitmap. The stream and the bitmap are disposed when
        // this try block is left, i.e. before the temporary JPEG is deleted below.
        var file = await StorageFile.GetFileFromPathAsync(filePath);
        using var stream = await file.OpenAsync(FileAccessMode.Read);
        var decoder = await BitmapDecoder.CreateAsync(stream);
        using var bmp = await decoder.GetSoftwareBitmapAsync();

        try
        {
            var engine = OcrEngine.TryCreateFromLanguage(new Windows.Globalization.Language("ja"));
            if (engine == null)
            {
                LogWriter.WriteLog("OCRエンジンの作成に失敗しました", LogWriter.LogLevel.ERROR);
                return null;
            }

            var result = await engine.RecognizeAsync(bmp);

            // Take the text of every recognized line and join the lines with line breaks.
            return string.Join(Environment.NewLine, result.Lines.Select(line => line.Text));
        }
        catch (Exception ocrException)
        {
            LogWriter.WriteLog($"OCR認識に失敗しました: {ocrException.Message}", LogWriter.LogLevel.ERROR);
            return null;
        }
    }
    catch (Exception e)
    {
        LogWriter.WriteLog($"予期しないエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
        return null;
    }
    finally
    {
        // Remove the intermediate JPEG whether OCR succeeded or not.
        if (convertedFilePath != null && File.Exists(convertedFilePath))
        {
            try
            {
                File.Delete(convertedFilePath);
            }
            catch (Exception deleteException)
            {
                LogWriter.WriteLog($"JPEGファイルの削除に失敗しました: {deleteException.Message}", LogWriter.LogLevel.ERROR);
            }
        }
    }
}
|
||
|
||
/// <summary>Converts a HEIC image to a JPEG file using Magick.NET.</summary>
/// <param name="heicFilePath">Path of the HEIC file to convert.</param>
/// <returns>
/// Path of the newly written JPEG, or null when the conversion fails (the failure is logged).
/// The JPEG is written to a uniquely named file in the temp directory; the caller is expected to delete it.
/// </returns>
private static string? ConvertHeicToJpeg(string heicFilePath)
{
    try
    {
        // Write to a unique temp file rather than next to the source. Replacing ".heic" with ".jpg"
        // in the whole path would also rewrite directory names, and could overwrite an existing
        // JPEG of the same name that the caller would then delete.
        string outputFileName = $"{Path.GetFileNameWithoutExtension(heicFilePath)}_{Guid.NewGuid():N}.jpg";
        string outputFilePath = Path.Combine(Path.GetTempPath(), outputFileName);

        using (var image = new MagickImage(heicFilePath))
        {
            // Save as JPEG.
            image.Format = MagickFormat.Jpeg;
            image.Write(outputFilePath);
        }

        return outputFilePath;
    }
    catch (Exception e)
    {
        LogWriter.WriteLog($"HEICファイルの変換中にエラーが発生しました: {e.Message}", LogWriter.LogLevel.ERROR);
        return null; // Conversion failed.
    }
}
|
||
|
||
/// <summary>Extracts the text of a document file, choosing the reader from the file extension.</summary>
/// <param name="filePath">Path of a .txt, .pdf, .doc/.docx, .xls/.xlsx or .ppt/.pptx file.</param>
/// <returns>The extracted text. Null only for a PDF that reports zero pages.</returns>
/// <exception cref="ArgumentException">The extension is not one of the supported document types.</exception>
/// <remarks>Any exception is written to the console and the log, then rethrown to the caller.</remarks>
public static string? ReadDocument(string filePath)
{
    try
    {
        // Makes the code-page encodings (Shift_JIS, EUC-JP, iso-2022-jp) available to GetEncoding.
        Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

        switch (Path.GetExtension(filePath).ToLowerInvariant())
        {
            case ".txt":
                return ReadTextFile(filePath);
            case ".pdf":
                return ReadPdfText(filePath);
            case ".docx":
            case ".doc":
                return ReadWordText(filePath);
            case ".xlsx":
            case ".xls":
                return ReadExcelText(filePath);
            case ".pptx":
            case ".ppt":
                return ReadPowerPointText(filePath);
            default:
                throw new ArgumentException("ドキュメントファイルではありません", nameof(filePath));
        }
    }
    catch (Exception ex)
    {
        System.Console.Write(ex.Message);
        System.Console.Write(ex.StackTrace);
        LogWriter.WriteLog($"読み出しでエラー発生: {ex.Message}", LogWriter.LogLevel.ERROR);
        throw;
    }
}

/// <summary>Reads a plain-text file after guessing its encoding from the first 1000 bytes.</summary>
private static string ReadTextFile(string filePath)
{
    byte[] firstData = new byte[1000];

    // Pre-fill bytes 2-3 so that a file consisting only of a UTF-16 BOM is not mistaken for UTF-32 LE.
    firstData[2] = 0xFF;
    firstData[3] = 0xFF;

    int readCount;
    // Open read-only: the default FileAccess.ReadWrite fails on read-only files.
    using (FileStream fs = new(filePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        readCount = fs.Read(firstData, 0, firstData.Length);
    }

    using StreamReader sr = new(filePath, GetEncoding(firstData, readCount));
    return sr.ReadToEnd();
}

/// <summary>Concatenates the text of every page of a PDF.</summary>
private static string? ReadPdfText(string filePath)
{
    using PdfDocument doc = PdfDocument.Load(filePath);

    // A document without pages yields null, as the original loop-based concatenation did.
    if (doc.PageCount == 0)
    {
        return null;
    }

    var text = new StringBuilder();
    for (var pageNum = 0; pageNum < doc.PageCount; pageNum++)
    {
        text.Append(doc.GetPdfText(pageNum));
    }
    return text.ToString();
}

/// <summary>Extracts body text, shape text and comments from a Word document via COM automation.</summary>
private static string ReadWordText(string filePath)
{
    // NOTE(review): no explicit Application.Quit() is issued here — presumably ComWrapper.Dispose
    // shuts the Office process down; confirm against ComWrapper.
    using var word = new ComWrapper<WordApplication>(new WordApplication() { Visible = false, DisplayAlerts = WdAlertLevel.wdAlertsNone });
    using var docs = new ComWrapper<Documents>(word.ComObject.Documents);
    using var doc = new ComWrapper<Document>(docs.ComObject.Open(filePath,
        ReadOnly: true,
        AddToRecentFiles: false,
        Visible: false));

    // [0] = body text, [1] = shape text and comments.
    var tempFiles = new string[2];
    try
    {
        try
        {
            tempFiles[0] = Path.GetTempFileName();
            tempFiles[1] = Path.GetTempFileName();

            // Whole body text of the document.
            string fullText = doc.ComObject.Content.Text;
            File.WriteAllText(tempFiles[0], fullText, Encoding.UTF8);

            // Text inside shapes (text boxes etc.) and review comments.
            List<string> otherContents = new List<string>();
            foreach (Microsoft.Office.Interop.Word.Shape shape in doc.ComObject.Shapes)
            {
                ExtractShapeContents(otherContents, shape);
            }
            foreach (Microsoft.Office.Interop.Word.Comment comment in doc.ComObject.Comments)
            {
                otherContents.Add(comment.Author + ":" + comment.Range.Text);
            }
            File.WriteAllLines(tempFiles[1], otherContents, Encoding.GetEncoding("UTF-8"));
        }
        finally
        {
            // Close without saving, whether extraction succeeded or not.
            doc.ComObject.Close(false);
        }

        // Reading the temp files back line by line strips the line breaks from the result.
        return MergeTextContents(tempFiles);
    }
    finally
    {
        // Runs even when Close throws, so temp files are never left behind.
        DeleteFiles(tempFiles);
    }
}

/// <summary>Extracts cell text, shape text, threaded comments and notes from every worksheet of an Excel workbook.</summary>
private static string ReadExcelText(string filePath)
{
    // NOTE(review): no explicit Application.Quit() is issued here — presumably ComWrapper.Dispose
    // shuts the Office process down; confirm against ComWrapper.
    using var excel = new ComWrapper<ExcelApplication>(new ExcelApplication() { Visible = false, DisplayAlerts = false });
    using var books = new ComWrapper<Workbooks>(excel.ComObject.Workbooks);
    using var book = new ComWrapper<Workbook>(books.ComObject.Open(filePath,
        UpdateLinks: XlUpdateLinks.xlUpdateLinksNever,
        ReadOnly: true,
        IgnoreReadOnlyRecommended: true,
        Editable: false));

    // Two temp files per sheet, in order: cell text, then shapes/comments/notes.
    List<string> tempFiles = new();
    try
    {
        try
        {
            for (int i = 1; i <= book.ComObject.Worksheets.Count; i++)
            {
                using var sheet = new ComWrapper<Worksheet>(book.ComObject.Worksheets[i]);

                var cellFile = Path.GetTempFileName();
                tempFiles.Add(cellFile);

                // Displayed text of every cell in the used range.
                List<string> sheetContents = new();
                foreach (Microsoft.Office.Interop.Excel.Range cell in sheet.ComObject.UsedRange)
                {
                    sheetContents.Add(cell.Text.ToString());
                }
                File.WriteAllLines(cellFile, sheetContents, Encoding.GetEncoding("UTF-8"));

                // Text inside shapes.
                List<string> otherContents = new();
                foreach (Microsoft.Office.Interop.Excel.Shape shape in sheet.ComObject.Shapes)
                {
                    ExtractShapesContents(otherContents, shape);
                }

                // Threaded comments.
                foreach (Microsoft.Office.Interop.Excel.CommentThreaded comment in sheet.ComObject.CommentsThreaded)
                {
                    otherContents.Add(comment.Author.Name + ":" + comment.Text());
                }

                // Legacy comments ("notes").
                foreach (Microsoft.Office.Interop.Excel.Comment memo in sheet.ComObject.Comments)
                {
                    otherContents.Add(memo.Author + ":" + memo.Text());
                }

                var otherFile = Path.GetTempFileName();
                tempFiles.Add(otherFile);
                File.WriteAllLines(otherFile, otherContents, Encoding.GetEncoding("UTF-8"));
            }
        }
        finally
        {
            // Close without saving, whether extraction succeeded or not.
            book.ComObject.Close(false);
        }

        // Concatenate all temp files in creation order, without separators.
        return MergeTextContents(tempFiles.ToArray());
    }
    finally
    {
        DeleteFiles(tempFiles.ToArray());
    }
}

/// <summary>Extracts shape text, comments and speaker notes from every slide of a PowerPoint presentation.</summary>
private static string ReadPowerPointText(string filePath)
{
    // NOTE(review): no explicit Application.Quit() is issued here — presumably ComWrapper.Dispose
    // shuts the Office process down; confirm against ComWrapper.
    using var powerPoint = new ComWrapper<PowerPointApplication>(new PowerPointApplication() { DisplayAlerts = PpAlertLevel.ppAlertsNone });
    using var ppts = new ComWrapper<Presentations>(powerPoint.ComObject.Presentations);
    using var ppt = new ComWrapper<Presentation>(ppts.ComObject.Open(filePath,
        ReadOnly: Microsoft.Office.Core.MsoTriState.msoTrue,
        WithWindow: Microsoft.Office.Core.MsoTriState.msoFalse));

    var tempFiles = new string[1];
    try
    {
        try
        {
            tempFiles[0] = Path.GetTempFileName();

            var slideContents = new List<string>();
            foreach (Slide slide in ppt.ComObject.Slides)
            {
                foreach (Microsoft.Office.Interop.PowerPoint.Shape shape in slide.Shapes)
                {
                    ExtractShapeContents(slideContents, shape);
                }
                foreach (Microsoft.Office.Interop.PowerPoint.Comment comment in slide.Comments)
                {
                    slideContents.Add(comment.Author + ":" + comment.Text);
                }
                // Placeholder 2 of the notes page holds the notes text; placeholder 1 is the slide image.
                slideContents.Add(slide.NotesPage.Shapes.Placeholders[2].TextFrame.TextRange.Text);
            }
            File.WriteAllLines(tempFiles[0], slideContents, Encoding.GetEncoding("UTF-8"));
        }
        finally
        {
            ppt.ComObject.Close();
        }

        return MergeTextContents(tempFiles);
    }
    finally
    {
        DeleteFiles(tempFiles);
    }
}
|
||
|
||
/// <summary>
/// Appends the text of a Word shape to <paramref name="contents"/>, descending recursively
/// into grouped shapes and drawing canvases.
/// </summary>
/// <param name="contents">List that receives every non-empty text found.</param>
/// <param name="shape">Word shape to inspect.</param>
private static void ExtractShapeContents(List<string> contents, Microsoft.Office.Interop.Word.Shape shape)
{
    shape.Select();

    switch (shape.Type)
    {
        case Microsoft.Office.Core.MsoShapeType.msoGroup:
            // Recurse into each member of the group.
            foreach (Microsoft.Office.Interop.Word.Shape member in shape.GroupItems)
            {
                ExtractShapeContents(contents, member);
            }
            return;

        case Microsoft.Office.Core.MsoShapeType.msoCanvas:
            // Recurse into each shape placed on the canvas.
            foreach (Microsoft.Office.Interop.Word.Shape member in shape.CanvasItems)
            {
                ExtractShapeContents(contents, member);
            }
            return;
    }

    // Plain shape: nothing to collect unless it has a text frame that holds text.
    if (shape.TextFrame == null || shape.TextFrame.HasText == 0)
    {
        return;
    }

    var shapeText = shape.TextFrame?.TextRange?.Text;
    if (!string.IsNullOrEmpty(shapeText))
    {
        contents.Add(shapeText);
    }
}
|
||
|
||
/// <summary>
/// Appends the text of an Excel shape to <paramref name="contents"/>, descending recursively
/// into grouped shapes and drawing canvases.
/// </summary>
/// <param name="contents">List that receives every non-empty text found.</param>
/// <param name="shape">Excel shape to inspect.</param>
private static void ExtractShapesContents(List<string> contents, Microsoft.Office.Interop.Excel.Shape shape)
{
    switch (shape.Type)
    {
        case Microsoft.Office.Core.MsoShapeType.msoGroup:
            foreach (Microsoft.Office.Interop.Excel.Shape member in shape.GroupItems)
            {
                ExtractShapesContents(contents, member);
            }
            return;

        case Microsoft.Office.Core.MsoShapeType.msoCanvas:
            foreach (Microsoft.Office.Interop.Excel.Shape member in shape.CanvasItems)
            {
                ExtractShapesContents(contents, member);
            }
            return;
    }

    try
    {
        var shapeText = shape.TextEffect?.Text;
        if (!string.IsNullOrEmpty(shapeText))
        {
            contents.Add(shapeText);
        }
    }
    catch
    {
        // Any failure while reading the text is ignored and the shape is skipped.
        // NOTE(review): presumably a deliberate best-effort for shapes that carry no text — confirm.
    }
}
|
||
|
||
/// <summary>
/// Appends the text of a PowerPoint shape to <paramref name="contents"/>, descending recursively
/// into grouped shapes and drawing canvases.
/// </summary>
/// <param name="contents">List that receives every non-empty text found.</param>
/// <param name="shape">PowerPoint shape to inspect.</param>
private static void ExtractShapeContents(List<string> contents, Microsoft.Office.Interop.PowerPoint.Shape shape)
{
    switch (shape.Type)
    {
        case Microsoft.Office.Core.MsoShapeType.msoGroup:
            foreach (Microsoft.Office.Interop.PowerPoint.Shape member in shape.GroupItems)
            {
                ExtractShapeContents(contents, member);
            }
            return;

        case Microsoft.Office.Core.MsoShapeType.msoCanvas:
            foreach (Microsoft.Office.Interop.PowerPoint.Shape member in shape.CanvasItems)
            {
                ExtractShapeContents(contents, member);
            }
            return;
    }

    // Plain shape: nothing to collect unless it has a text frame that holds text.
    if (shape.TextFrame == null || shape.TextFrame.HasText != Microsoft.Office.Core.MsoTriState.msoTrue)
    {
        return;
    }

    var shapeText = shape.TextFrame?.TextRange?.Text;
    if (!string.IsNullOrEmpty(shapeText))
    {
        contents.Add(shapeText);
    }
}
|
||
|
||
/// <summary>
/// Reads the given temp files in order and concatenates all their lines into one string,
/// dropping the line breaks.
/// </summary>
/// <param name="tempFiles">Paths of the files to merge; null or missing entries are skipped.</param>
/// <returns>All lines of all existing files joined without a separator.</returns>
private static string MergeTextContents(string[] tempFiles)
{
    var merged = new StringBuilder();

    foreach (var tempFile in tempFiles)
    {
        if (!File.Exists(tempFile))
        {
            continue;
        }

        // The temp files are written as UTF-8 by the callers, so read them as UTF-8.
        // Reading them as Shift_JIS only worked because of BOM sniffing and needed the
        // code-pages encoding provider to be registered first.
        foreach (var line in File.ReadAllLines(tempFile, Encoding.UTF8))
        {
            merged.Append(line);
        }
    }

    return merged.ToString();
}
|
||
|
||
/// <summary>Deletes every file in <paramref name="tempFiles"/> that currently exists.</summary>
/// <param name="tempFiles">Paths to delete; null or already-missing entries are ignored.</param>
private static void DeleteFiles(string[] tempFiles)
{
    // Where() is lazy, so each existence check happens right before the matching delete.
    foreach (var existingFile in tempFiles.Where(path => File.Exists(path)))
    {
        File.Delete(existingFile);
    }
}
|
||
|
||
/// <summary>
/// Guesses the text encoding of a file from its leading bytes: first by byte-order mark,
/// then by checking the sample against ISO-2022-JP, UTF-8 and EUC-JP, falling back to Shift_JIS.
/// </summary>
/// <param name="firstData">Buffer holding the first bytes of the file.</param>
/// <param name="dataLength">Number of bytes in <paramref name="firstData"/> that were actually read from the file.</param>
/// <returns>The encoding to decode the file with.</returns>
/// <remarks>
/// NOTE(review): when the sample contains only 7-bit bytes and no escape sequence, "us-ascii" is
/// returned; a file whose non-ASCII text starts after the sampled bytes would then be decoded
/// lossily — confirm this is acceptable for the expected inputs.
/// </remarks>
private static Encoding GetEncoding(byte[] firstData, int dataLength)
{
    if (dataLength < 2)
    {
        return Encoding.GetEncoding("Shift_JIS");
    }

    if ((firstData[0] == 0xfe) && (firstData[1] == 0xff))
    {
        // UTF-16 BE
        return new UnicodeEncoding(true, true);
    }

    if ((firstData[0] == 0xff) && (firstData[1] == 0xfe))
    {
        // Only look at bytes 2-3 when they were really read from the file; checking the buffer
        // capacity instead would inspect stale buffer content for 2- or 3-byte files.
        if ((4 <= dataLength) &&
            (firstData[2] == 0x00) && (firstData[3] == 0x00))
        {
            // UTF-32 LE
            return new UTF32Encoding(false, true);
        }

        // UTF-16 LE
        return new UnicodeEncoding(false, true);
    }

    if (dataLength < 3)
    {
        return Encoding.GetEncoding("Shift_JIS");
    }

    if ((firstData[0] == 0xef) && (firstData[1] == 0xbb) && (firstData[2] == 0xbf))
    {
        // UTF-8 with BOM
        return new UTF8Encoding(true, true);
    }

    if (dataLength < 4)
    {
        return Encoding.GetEncoding("Shift_JIS");
    }

    if ((firstData[0] == 0x00) && (firstData[1] == 0x00) &&
        (firstData[2] == 0xfe) && (firstData[3] == 0xff))
    {
        // UTF-32 BE
        return new UTF32Encoding(true, true);
    }

    // No BOM: fall back to content heuristics.
    // Item1 = a byte >= 0x80 was found, Item2 = an ISO-2022-JP escape sequence was found.
    var ret = JISEncodingJudgment(firstData, dataLength);
    if (ret.Item1 == false)
    {
        // Pure 7-bit sample: ISO-2022-JP when an escape sequence is present, otherwise ASCII.
        if (ret.Item2)
        {
            return Encoding.GetEncoding("iso-2022-jp");
        }
        else
        {
            return Encoding.GetEncoding("us-ascii");
        }
    }

    // The judgment helpers return true when the sample violates the encoding's byte rules.
    var outOfSpecification = Utf8EncodingJudgment(firstData, dataLength);
    if (outOfSpecification == false)
    {
        // UTF-8 without BOM
        return new UTF8Encoding(true, true);
    }

    outOfSpecification = EUCJPEncodingJudgment(firstData, dataLength);
    if (outOfSpecification == false)
    {
        return Encoding.GetEncoding("EUC-JP");
    }

    return Encoding.GetEncoding("Shift_JIS");
}
|
||
|
||
/// <summary>Checks whether <paramref name="data"/> starts with the byte sequence <paramref name="bom"/>.</summary>
/// <param name="data">Bytes to examine.</param>
/// <param name="bom">Byte sequence that must appear at the start of <paramref name="data"/>.</param>
/// <returns>True when every byte of <paramref name="bom"/> equals the byte at the same index of <paramref name="data"/>.</returns>
private static bool IsMatched(byte[] data, byte[] bom)
{
    // Data shorter than the pattern can never match (and must not be indexed past its end).
    if (data.Length < bom.Length)
    {
        return false;
    }

    for (int i = 0; i < bom.Length; i++)
    {
        if (bom[i] != data[i])
        {
            return false;
        }
    }

    return true;
}

/// <summary>Escape sequences that switch character sets in ISO-2022-JP (RFC 1468).</summary>
private static readonly byte[][] Iso2022JpEscapeSequences =
{
    new byte[] { 0x1B, 0x28, 0x42 }, // ESC ( B : ASCII
    new byte[] { 0x1B, 0x24, 0x42 }, // ESC $ B : JIS X 0208-1983
    new byte[] { 0x1B, 0x28, 0x4A }, // ESC ( J : JIS X 0201 Roman
    new byte[] { 0x1B, 0x24, 0x40 }, // ESC $ @ : JIS X 0208-1978
};

/// <summary>
/// Scans the sample for 8-bit bytes and for ISO-2022-JP escape sequences.
/// Scanning stops at the first byte that is 0x80 or above.
/// </summary>
/// <param name="buffer">Sample bytes.</param>
/// <param name="sizeOfBuffer">Number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>
/// Item1: true when a byte &gt;= 0x80 was found (so the data is not a 7-bit encoding).
/// Item2: true when an ISO-2022-JP escape sequence was seen before scanning stopped.
/// </returns>
private static (bool, bool) JISEncodingJudgment(byte[] buffer, int sizeOfBuffer)
{
    bool escapeSeen = false;

    // Sliding window over the last three 7-bit bytes.
    byte[] lastThree = { 0, 0, 0 };

    for (int i = 0; i < sizeOfBuffer; i++)
    {
        byte current = buffer[i];
        if (0x80 <= current)
        {
            return (true, escapeSeen);
        }

        lastThree[0] = lastThree[1];
        lastThree[1] = lastThree[2];
        lastThree[2] = current;

        if (!escapeSeen)
        {
            foreach (byte[] sequence in Iso2022JpEscapeSequences)
            {
                if (IsMatched(lastThree, sequence))
                {
                    escapeSeen = true;
                    break;
                }
            }
        }
    }

    return (false, escapeSeen);
}
|
||
|
||
/// <summary>Checks whether the sample violates the byte structure of UTF-8.</summary>
/// <param name="buffer">Sample bytes.</param>
/// <param name="sizeOfBuffer">Number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>
/// True when the sample is NOT structurally valid UTF-8; false when every 2-, 3- and 4-byte sequence
/// has the right number of continuation bytes. A sequence cut off at the end of the sample is tolerated.
/// </returns>
private static bool Utf8EncodingJudgment(byte[] buffer, int sizeOfBuffer)
{
    // Masked lead byte of the sequence in progress (0 when no sequence is open)
    // and the number of bytes seen for it so far, lead byte included.
    uint leadMarker = 0;
    int sequenceLength = 0;

    for (int i = 0; i < sizeOfBuffer; i++)
    {
        uint current = buffer[i];

        if (current < 0x80)
        {
            // A 7-bit byte must not interrupt an unfinished multi-byte sequence.
            if (Utf8OutOfSpecification(leadMarker, sequenceLength, false))
            {
                return true;
            }

            leadMarker = 0;
            sequenceLength = 0;
            continue;
        }

        // Classify the byte as a lead byte of a 2-, 3- or 4-byte sequence, if it is one.
        uint newLeadMarker = 0;
        if ((current & 0b11100000) == 0b11000000)
        {
            newLeadMarker = 0b11000000;
        }
        else if ((current & 0b11110000) == 0b11100000)
        {
            newLeadMarker = 0b11100000;
        }
        else if ((current & 0b11111000) == 0b11110000)
        {
            newLeadMarker = 0b11110000;
        }

        if (newLeadMarker != 0)
        {
            // The previous sequence must be complete before a new one starts.
            if (Utf8OutOfSpecification(leadMarker, sequenceLength, false))
            {
                return true;
            }

            leadMarker = newLeadMarker;
            sequenceLength = 1;
            continue;
        }

        // Anything that is neither a lead byte nor a continuation byte (0xF8-0xFF) is invalid.
        if ((current & 0b11000000) != 0b10000000)
        {
            return true;
        }

        // A continuation byte needs a sequence to belong to.
        if (sequenceLength < 1)
        {
            return true;
        }

        // Too many continuation bytes for the current lead byte is invalid.
        sequenceLength++;
        if (Utf8OutOfSpecification(leadMarker, sequenceLength, true))
        {
            return true;
        }
    }

    return false;
}

/// <summary>Compares the length of a UTF-8 sequence with the length its lead byte announces.</summary>
/// <param name="topByteChar">Masked lead byte: 0b11000000 (2 bytes), 0b11100000 (3 bytes) or 0b11110000 (4 bytes); any other value is never out of specification.</param>
/// <param name="byteCharCount">Number of bytes seen for the sequence so far, lead byte included.</param>
/// <param name="checkBig">True to test for "too long", false to test for "too short".</param>
/// <returns>True when the sequence is longer (checkBig) or shorter (!checkBig) than announced.</returns>
private static bool Utf8OutOfSpecification(uint topByteChar, int byteCharCount, bool checkBig)
{
    int expectedLength = topByteChar switch
    {
        0b11000000 => 2,
        0b11100000 => 3,
        0b11110000 => 4,
        _ => 0,
    };

    if (expectedLength == 0)
    {
        return false;
    }

    return checkBig ? byteCharCount > expectedLength : byteCharCount < expectedLength;
}
|
||
|
||
/// <summary>
/// Scanner state for <see cref="EUCJPEncodingJudgment"/>: what the next byte is expected to be.
/// OneByteCode = start of a new character, TwoByteCode = second byte of a 2-byte character,
/// KanaOneByte = the kana byte that follows the 0x8E prefix.
/// </summary>
private enum BYTECODE : byte { OneByteCode, TwoByteCode, KanaOneByte }

/// <summary>Checks whether the sample violates the byte structure of EUC-JP.</summary>
/// <param name="buffer">Sample bytes.</param>
/// <param name="sizeOfBuffer">Number of valid bytes in <paramref name="buffer"/>.</param>
/// <returns>
/// True when the sample is NOT structurally valid EUC-JP. Accepted forms are 7-bit bytes,
/// 2-byte characters (0xA1-0xFE twice) and half-width kana (0x8E followed by 0xA1-0xDF).
/// A character cut off at the end of the sample is tolerated. The 3-byte 0x8F form is rejected.
/// </returns>
private static bool EUCJPEncodingJudgment(byte[] buffer, int sizeOfBuffer)
{
    BYTECODE expected = BYTECODE.OneByteCode;

    for (int i = 0; i < sizeOfBuffer; i++)
    {
        byte current = buffer[i];

        switch (expected)
        {
            case BYTECODE.TwoByteCode:
                // Second byte of a 2-byte character.
                if (current < 0xA1 || current > 0xFE)
                {
                    return true;
                }
                expected = BYTECODE.OneByteCode;
                break;

            case BYTECODE.KanaOneByte:
                // Half-width kana byte after the 0x8E prefix.
                if (current < 0xA1 || current > 0xDF)
                {
                    return true;
                }
                expected = BYTECODE.OneByteCode;
                break;

            default:
                if (current <= 0x7F)
                {
                    // 7-bit character: nothing further expected.
                }
                else if (current == 0x8E)
                {
                    expected = BYTECODE.KanaOneByte;
                }
                else if (0xA1 <= current && current <= 0xFE)
                {
                    expected = BYTECODE.TwoByteCode;
                }
                else
                {
                    // Not a valid first byte in EUC-JP.
                    return true;
                }
                break;
        }
    }

    return false;
}
|
||
|
||
/// <summary>Cleanses extracted text: strips the symbols listed in a CSV file, then normalizes character forms.</summary>
/// <param name="fileText">Text to cleanse.</param>
/// <param name="csvFilePath">Path of the CSV file that lists the symbols to remove.</param>
/// <returns>
/// The cleansed text. When the CSV yields no symbols the input is returned unchanged; when an error
/// occurs it is logged and the text as processed so far is returned.
/// </returns>
public static string? CleansingText(string fileText, string csvFilePath)
{
    try
    {
        // Load the symbols to exclude; the CSV is read as Shift_JIS, which needs the code-pages provider.
        System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance);
        List<string> symbolsToStrip = LoadSpecialCharacters(csvFilePath);

        // NOTE(review): with an empty symbol list the normalization step is skipped as well —
        // confirm that this coupling is intended.
        if (symbolsToStrip.Count == 0)
        {
            return fileText;
        }

        // Remove the symbols one at a time; blank CSV columns are ignored.
        foreach (var symbol in symbolsToStrip.Where(candidate => !string.IsNullOrEmpty(candidate)))
        {
            fileText = fileText.Replace(symbol, string.Empty);
        }

        // Unify character forms (katakana width, Roman numerals, alphanumerics).
        fileText = NormalizeText(fileText);
        return fileText;
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"クレンジング処理中にエラーが発生しました: {ex.Message}", LogWriter.LogLevel.ERROR);
        Console.WriteLine($"エラーが発生しました: {ex.Message}");
        return fileText;
    }
}
|
||
|
||
/// <summary>
/// Loads the list of symbols to strip from the first line of a Shift_JIS encoded CSV file.
/// </summary>
/// <param name="csvFilePath">Path of the CSV file.</param>
/// <returns>
/// The trimmed, comma-separated values of the first line (blank columns included as empty strings);
/// an empty list when the file is empty, its first line is blank, or reading fails (the error is logged).
/// </returns>
private static List<string> LoadSpecialCharacters(string csvFilePath)
{
    var symbols = new List<string>();

    try
    {
        using var reader = new StreamReader(csvFilePath, System.Text.Encoding.GetEncoding("shift_jis"));

        // Only the first line of the file is used.
        string? firstLine = reader.EndOfStream ? null : reader.ReadLine();
        string trimmedLine = firstLine?.Trim() ?? string.Empty;

        if (trimmedLine.Length > 0)
        {
            // One list entry per comma-separated column.
            symbols.AddRange(trimmedLine.Split(',').Select(column => column.Trim()));
        }
    }
    catch (Exception ex)
    {
        LogWriter.WriteLog($"CSVファイルの読み込みでエラーが発生しました: {ex.Message}", LogWriter.LogLevel.ERROR);
        Console.WriteLine($"CSVファイルの読み込みエラー: {ex.Message}");
    }

    return symbols;
}
|
||
|
||
/// <summary>
/// Normalizes text in three passes: katakana normalization, Roman numerals to Arabic digits,
/// then alphabet and digit normalization.
/// </summary>
/// <param name="inputText">Text to normalize.</param>
/// <returns>The normalized text.</returns>
private static string NormalizeText(string inputText)
{
    // Pass order: 1. katakana, 2. Roman numerals, 3. alphabet and digits.
    string katakanaNormalized = NormalizeKatakana(inputText);
    string numeralsConverted = NormalizeRomanNumerals(katakanaNormalized);
    return NormalizeAlphabetAndNumbers(numeralsConverted);
}
|
||
|
||
/// <summary>
/// Matches a run of half-width katakana (U+FF66-U+FF9F) that starts with a letter, so that a
/// voiced / semi-voiced sound mark (U+FF9E / U+FF9F) is only converted together with its base letter.
/// </summary>
private static readonly Regex HalfWidthKatakanaRun = new("[\uFF66-\uFF9D][\uFF66-\uFF9F]*", RegexOptions.Compiled);

/// <summary>
/// Converts half-width katakana to full-width katakana. A letter followed by a half-width voiced or
/// semi-voiced sound mark is combined into the single full-width character where one exists.
/// </summary>
/// <param name="inputText">Text to convert.</param>
/// <returns>The text with half-width katakana replaced; all other characters are left untouched.</returns>
private static string NormalizeKatakana(string inputText)
{
    // NFKC is applied to the katakana runs only: applied to the whole string it would also fold
    // full-width alphanumerics and Roman numerals, which later normalization steps handle themselves.
    return HalfWidthKatakanaRun.Replace(inputText, run => run.Value.Normalize(NormalizationForm.FormKC));
}
|
||
|
||
/// <summary>
/// Matches a run of Unicode Roman numeral characters U+2160-U+217F
/// (upper case I..XII, L, C, D, M followed by the lower-case forms in the same order).
/// </summary>
private static readonly Regex RomanNumeralRun = new("[\u2160-\u217F]+", RegexOptions.Compiled);

/// <summary>
/// Values of the sixteen Roman numeral characters, in code-point order starting at U+2160;
/// the lower-case block starting at U+2170 repeats the same order.
/// </summary>
private static readonly int[] RomanNumeralValues = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 50, 100, 500, 1000 };

/// <summary>Replaces every run of Unicode Roman numeral characters with its Arabic value.</summary>
/// <param name="inputText">Text to convert.</param>
/// <returns>The text with Roman numeral runs replaced by decimal digits.</returns>
private static string NormalizeRomanNumerals(string inputText)
{
    return RomanNumeralRun.Replace(inputText, match => ConvertRomanToArabic(match.Value));
}

/// <summary>Converts a string of Unicode Roman numeral characters to its Arabic value.</summary>
/// <param name="roman">Roman numeral characters; any other character is skipped.</param>
/// <returns>The value as a decimal string.</returns>
private static string ConvertRomanToArabic(string roman)
{
    int total = 0;
    int previousValue = 0;

    // Walk right to left: a numeral smaller than the one to its right is subtracted, otherwise added.
    for (int i = roman.Length - 1; i >= 0; i--)
    {
        char currentChar = roman[i];
        if (currentChar < '\u2160' || currentChar > '\u217F')
        {
            // Not a Roman numeral character.
            continue;
        }

        int currentValue = RomanNumeralValues[(currentChar - '\u2160') % RomanNumeralValues.Length];

        if (currentValue < previousValue)
        {
            total -= currentValue;
        }
        else
        {
            total += currentValue;
        }

        previousValue = currentValue;
    }

    return total.ToString();
}
|
||
|
||
/// <summary>Lower-cases the text, then converts alphabet letters and digits to half-width.</summary>
/// <param name="inputText">Text to convert.</param>
/// <returns>The text with lower-case half-width letters and half-width digits.</returns>
private static string NormalizeAlphabetAndNumbers(string inputText)
{
    // Culture-invariant lower-casing: the result must not depend on the machine's culture
    // (under Turkish cultures ToLower() maps "I" to a dotless i).
    string result = inputText.ToLowerInvariant();

    // Letters and digits to half-width.
    result = ToHalfWidth(result);

    // Digits once more, in case any are still full-width.
    result = NormalizeNumbers(result);

    return result;
}
|
||
|
||
/// <summary>Converts full-width digits (U+FF10-U+FF19) to the ASCII digits 0-9.</summary>
/// <param name="inputText">Text to convert.</param>
/// <returns>The text with full-width digits replaced; all other characters are left untouched.</returns>
private static string NormalizeNumbers(string inputText)
{
    // The code points are spelled as escapes so the full-width forms cannot be confused with,
    // or silently folded into, their ASCII look-alikes.
    char[] chars = inputText.ToCharArray();
    for (int i = 0; i < chars.Length; i++)
    {
        if (chars[i] >= '\uFF10' && chars[i] <= '\uFF19')
        {
            chars[i] = (char)(chars[i] - '\uFF10' + '0');
        }
    }
    return new string(chars);
}
|
||
|
||
/// <summary>
/// Converts full-width Latin letters (U+FF21-U+FF3A, U+FF41-U+FF5A) and full-width digits
/// (U+FF10-U+FF19) to their ASCII equivalents.
/// </summary>
/// <param name="inputText">Text to convert.</param>
/// <returns>The text with full-width letters and digits replaced; all other characters are left untouched.</returns>
private static string ToHalfWidth(string inputText)
{
    // The code points are spelled as escapes so the full-width forms cannot be confused with,
    // or silently folded into, their ASCII look-alikes.
    char[] chars = inputText.ToCharArray();
    for (int i = 0; i < chars.Length; i++)
    {
        char current = chars[i];
        if (current >= '\uFF21' && current <= '\uFF3A')
        {
            // Full-width upper-case letter.
            chars[i] = (char)(current - '\uFF21' + 'A');
        }
        else if (current >= '\uFF41' && current <= '\uFF5A')
        {
            // Full-width lower-case letter.
            chars[i] = (char)(current - '\uFF41' + 'a');
        }
        else if (current >= '\uFF10' && current <= '\uFF19')
        {
            // Full-width digit.
            chars[i] = (char)(current - '\uFF10' + '0');
        }
    }
    return new string(chars);
}
|
||
}
|
||
}
|