- 本文转载自:
- #region 利用com组件读取office
- /// <summary>
- /// Verifies that a file exists at the given path.
- /// </summary>
- /// <param name="pFileName">Path of the file to check.</param>
- /// <exception cref="ApplicationException">Thrown when File.Exists reports that the file is missing.</exception>
- private void IsExists(string pFileName) {
-     bool found = File.Exists(pFileName);
-     if (found) {
-         return;
-     }
-     throw new ApplicationException("指定目录下的无该文件");
- }
- /// <summary>
- /// Extracts the text content of a Word document through Word COM automation.
- /// </summary>
- /// <param name="docFileName">Path of the document to read.</param>
- /// <returns>The text of the document's Content range.</returns>
- /// <exception cref="ApplicationException">Thrown by IsExists when the file does not exist.</exception>
- public string Doc2Text(string docFileName) {
-     IsExists(docFileName);
-     // Start a Word automation instance (COM).
-     Word.ApplicationClass wordApp = new Word.ApplicationClass();
-     Word.Document doc = null;
-     object fileobj = docFileName;
-     // Placeholder passed for every optional COM argument.
-     object nullobj = System.Reflection.Missing.Value;
-     try {
-         // The number of Open parameters differs between interop versions;
-         // everything except the file name can normally be left as Missing.
-         doc = wordApp.Documents.Open(ref fileobj, ref nullobj, ref nullobj,
-             ref nullobj, ref nullobj, ref nullobj,
-             ref nullobj, ref nullobj, ref nullobj,
-             ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
-         return doc.Content.Text;
-     }
-     finally {
-         // Always shut Word down, even when opening or reading fails; otherwise
-         // the instance started above is never told to quit.
-         try {
-             if (doc != null) {
-                 doc.Close(ref nullobj, ref nullobj, ref nullobj);
-             }
-         }
-         finally {
-             wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
-             // Kept from the original; presumably forces release of the COM wrappers.
-             GC.Collect();
-         }
-     }
- }
- /// <summary>
- /// Extracts the cell values of every worksheet in an Excel workbook through
- /// Excel COM automation. Non-empty cells are appended in row-major order,
- /// each followed by a "|" separator.
- /// </summary>
- /// <param name="xlsFileName">Path of the workbook to read.</param>
- /// <returns>The concatenated cell values; empty when no cell holds a value.</returns>
- /// <exception cref="ApplicationException">Thrown by IsExists when the file does not exist.</exception>
- public string Xls2Text(string xlsFileName) {
-     IsExists(xlsFileName);
-     Excel.Application xlsApp = new Excel.ApplicationClass();
-     Excel.Workbook excel = null;
-     // Placeholder passed for every optional COM argument.
-     object nullobj = System.Reflection.Missing.Value;
-     StringBuilder builder = new StringBuilder();
-     try {
-         // Opened inside the try so that a failed Open still reaches Quit below.
-         excel = xlsApp.Workbooks.Open(xlsFileName, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj);
-         int sheetCount = excel.Worksheets.Count;
-         for (int k = 1; k <= sheetCount; k++) {
-             Excel.Worksheet ews = (Excel.Worksheet)excel.Worksheets[k];
-             // Fetch the used range once per sheet; every Value2 access is a COM
-             // call that copies the whole range.
-             object used = ews.UsedRange.Value2;
-             if (used == null) {
-                 continue;
-             }
-             object[,] cells = used as object[,];
-             if (cells == null) {
-                 // A used range of exactly one cell yields a scalar, not an array.
-                 builder.Append(used).Append("|");
-                 continue;
-             }
-             // Iterate by the array's own bounds (arrays returned by Excel are 1-based).
-             for (int i = cells.GetLowerBound(0); i <= cells.GetUpperBound(0); i++) {
-                 for (int j = cells.GetLowerBound(1); j <= cells.GetUpperBound(1); j++) {
-                     object cell = cells[i, j];
-                     if (cell != null) {
-                         builder.Append(cell).Append("|");
-                     }
-                 }
-             }
-         }
-     }
-     finally {
-         // Exceptions propagate unchanged (no "throw ex", which would reset the stack trace).
-         try {
-             if (excel != null) {
-                 excel.Close(nullobj, nullobj, nullobj);
-             }
-         }
-         finally {
-             xlsApp.Quit();
-             // Kept from the original; presumably forces release of the COM wrappers.
-             GC.Collect();
-         }
-     }
-     return builder.ToString();
- }
- /// <summary>
- /// Extracts the text of every shape on every slide of a PowerPoint
- /// presentation through PowerPoint COM automation.
- /// </summary>
- /// <param name="pptFileName">Path of the presentation to read.</param>
- /// <returns>The shape texts concatenated in slide/shape order, without separators.</returns>
- /// <exception cref="ApplicationException">Thrown by IsExists when the file does not exist.</exception>
- public string Ppt2Text(string pptFileName) {
-     IsExists(pptFileName);
-     PowerPoint.Application pptApp = new PowerPoint.ApplicationClass();
-     PowerPoint.Presentation ppt = null;
-     StringBuilder builder = new StringBuilder();
-     try {
-         // Arguments after the file name: ReadOnly = true, Untitled = false, WithWindow = false.
-         // Opened inside the try so that a failed Open still reaches Quit below.
-         ppt = pptApp.Presentations.Open(pptFileName,
-             Microsoft.Office.Core.MsoTriState.msoTrue,
-             Microsoft.Office.Core.MsoTriState.msoFalse,
-             Microsoft.Office.Core.MsoTriState.msoFalse);
-         foreach (PowerPoint.Slide slide in ppt.Slides) {
-             foreach (PowerPoint.Shape shape in slide.Shapes) {
-                 // Not every shape owns a text frame; skip those before touching TextFrame.
-                 if (shape.HasTextFrame != Microsoft.Office.Core.MsoTriState.msoTrue) {
-                     continue;
-                 }
-                 if (shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue) {
-                     builder.Append(shape.TextFrame.TextRange.Text);
-                 }
-             }
-         }
-     }
-     finally {
-         // Exceptions propagate unchanged (no "throw ex", which would reset the stack trace).
-         try {
-             if (ppt != null) {
-                 ppt.Close();
-             }
-         }
-         finally {
-             pptApp.Quit();
-             // Kept from the original; presumably forces release of the COM wrappers.
-             GC.Collect();
-         }
-     }
-     return builder.ToString();
- }
- #endregion
最近研究了全文搜索组件 Lucene.net,这是一个很有名的开源组件(也有 Java 版本)。其实谈不上研究,只是以前客户有个需求:要能搜索上传的文件(如 Word、Excel、Txt 等)。项目中这些附件都存放在一个 image 字段中,一直没有办法搜索。本文就讲一下如何利用 Lucene.net 对附件做搜索功能,并利用 COM 组件读取 Office 文档内容。先介绍一下 Lucene.net 的使用,这里用到了 Lucene.Net.dll 2.1、Highlighter.Net.dll 2.0(高亮)和 Lucene.Net.Analysis.Cn.dll 1.3(分词引擎):1 添加索引
- Code
- /// <summary>
- /// Adds one file to the Lucene index. The document gets three fields:
- /// "FileId" (stored, untokenized), "Title" (stored, tokenized) and "File"
- /// (the file content, read according to the file type).
- /// </summary>
- /// <param name="file">The Files entity to index.</param>
- public void AddIndex(Files file) {
-     // Build the document before opening the writer, so a failure while
-     // extracting text never leaves an IndexWriter open.
-     Document doc = new Document();
-     // UN_TOKENIZED indexes the id as a single term, so it can act as a key for Delete.
-     doc.Add(new Field("FileId", file.ID, Field.Store.YES, Field.Index.UN_TOKENIZED));
-     doc.Add(new Field("Title", file.Title, Field.Store.YES, Field.Index.TOKENIZED));
-     switch (file.FileType) {
-         case FileType.Txt:
-         case FileType.Htm:
-             // Plain-text formats are indexed straight from the stream.
-             doc.Add(new Field("File", new StreamReader(file.Stream, System.Text.Encoding.Default)));
-             break;
-         case FileType.Word:
-         case FileType.Mht:
-             // Both formats are read through Word automation.
-             doc.Add(new Field("File", Doc2Text(file.FileName), Field.Store.YES, Field.Index.TOKENIZED));
-             break;
-         case FileType.Excel:
-             doc.Add(new Field("File", Xls2Text(file.FileName), Field.Store.YES, Field.Index.TOKENIZED));
-             break;
-         case FileType.Ppt:
-             doc.Add(new Field("File", Ppt2Text(file.FileName), Field.Store.YES, Field.Index.TOKENIZED));
-             break;
-         default:
-             // Other types get no "File" field; only id and title are indexed.
-             break;
-     }
-     // Create a new index only when none exists yet; otherwise append to it.
-     bool create = !IndexReader.IndexExists(GetIndexPath);
-     IndexWriter writer = new IndexWriter(GetIndexPath, this.Analyzer, create);
-     try {
-         writer.AddDocument(doc);
-         writer.Optimize();
-     }
-     finally {
-         // Close even on failure so the writer is not left open.
-         writer.Close();
-     }
- }
其中,把 FileId 字段的索引方式设为 Field.Index.UN_TOKENIZED(不分词),相当于把它作为主键;后面删除索引时,直接按这个 id 删除即可。
switch (file.FileType) 则是根据附件类型,选择相应的方式解析并读取文件内容。
2 搜索
- Code
- /// <summary>
- /// Searches the "Title" and "File" fields of the index for the given text.
- /// </summary>
- /// <param name="pSearchStr">The query text.</param>
- /// <returns>
- /// A DataTable with the columns "Id" (1-based sequence number), "FileId"
- /// (record id) and "Title" (highlighted where the query matched), or null
- /// when <paramref name="pSearchStr"/> is null or empty.
- /// </returns>
- public DataTable Search(string pSearchStr) {
-     if (string.IsNullOrEmpty(pSearchStr)) {
-         return null;
-     }
-     IndexSearcher searcher = new IndexSearcher(this.GetIndexPath);
-     try {
-         // Multi-field search: the same text is parsed once per field.
-         // (For a single field, a QueryParser bound to that field would do.)
-         Query query = MultiFieldQueryParser.Parse(new string[] { pSearchStr, pSearchStr }, new string[] { "Title", "File" }, this.Analyzer);
-         Hits h = searcher.Search(query);
-         // Matched terms are wrapped in a red <font> tag.
-         SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
-         Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
-         // The fragment size could be adjusted here:
-         // highlighter.SetTextFragmenter(new SimpleFragmenter(400));
-         DataTable dt = new DataTable();
-         dt.Columns.Add("Id");      // sequence number
-         dt.Columns.Add("FileId");  // record id
-         dt.Columns.Add("Title");   // title
-         int hitCount = h.Length();
-         for (int i = 0; i < hitCount; i++) {
-             Document doc = h.Doc(i);
-             string title = doc.Get("Title");
-             if (!string.IsNullOrEmpty(title)) {
-                 // Use the highlighted fragments when the highlighter produced any;
-                 // otherwise keep the stored title unchanged.
-                 TokenStream tokenStream = (this.Analyzer).TokenStream("Title", new StringReader(title));
-                 string newTitle = highlighter.GetBestFragments(tokenStream, title, 5, "");
-                 if (!string.IsNullOrEmpty(newTitle)) {
-                     title = newTitle;
-                 }
-             }
-             this.AddRow(dt, i + 1, doc.Get("FileId"), title);
-         }
-         return dt;
-     }
-     finally {
-         // Close the searcher even when parsing or searching throws.
-         searcher.Close();
-     }
- }
目前只对标题(Title)和内容(File)两个字段建立了索引,所以搜索也只针对这两个字段。最后返回一个 DataTable,其中包括 FileId(记录 ID,以便下载附件)和 Title(标题)。搜索结果使用 Highlighter 做了高亮显示。
3 删除索引
- /// <summary>
- /// Deletes from the index every document whose "FileId" field equals the given id.
- /// </summary>
- /// <param name="pID">The record id that was stored in "FileId" when the file was indexed.</param>
- public void Delete(string pID) {
-     IndexReader reader = IndexReader.Open(GetIndexPath);
-     try {
-         Term aTerm = new Term("FileId", pID);
-         reader.DeleteDocuments(aTerm);
-     }
-     finally {
-         // Close() is required: per the original author's note, the deletion only
-         // takes effect once the reader is closed. It now also runs when
-         // DeleteDocuments throws, so the reader is never left open.
-         reader.Close();
-     }
- }
先创建个Term,然后用IndexReader删除
4 其他一些辅助属性
- #region 属性
- // Index directory as configured; GetIndexPath passes this value to Server.MapPath.
- string INDEX_STORE_PATH = "index";
- /// <summary>
- /// Gets or sets the index directory (the value is stored as given, defaulting to "index").
- /// </summary>
- public string IndexPath {
- get {
- return INDEX_STORE_PATH;
- }
- set {
- INDEX_STORE_PATH = value;
- }
- }
- /// <summary>
- /// Gets the physical path of the index directory by passing INDEX_STORE_PATH
- /// to Server.MapPath of the current HTTP context.
- /// </summary>
- private string GetIndexPath {
- get {
- // NOTE(review): HttpContext.Current is dereferenced without a null check;
- // presumably this is only called while handling a web request — confirm.
- return HttpContext.Current.Server.MapPath(INDEX_STORE_PATH);
- }
- }
- // Backing field for Analyzer; a ChineseAnalyzer unless replaced through the setter.
- Analyzer _analyzer = new ChineseAnalyzer();
- /// <summary>
- /// Gets or sets the analyzer used by AddIndex and Search.
- /// </summary>
- public Analyzer Analyzer {
- get {
- return _analyzer;
- }
- set {
- _analyzer = value;
- }
- }
- #endregion
5 通过com组件读取office文档内容
- #region 利用com组件读取office
- /// <summary>
- /// Verifies that a file exists at the given path.
- /// </summary>
- /// <param name="pFileName">Path of the file to check.</param>
- /// <exception cref="ApplicationException">Thrown when File.Exists reports that the file is missing.</exception>
- private void IsExists(string pFileName) {
-     bool found = File.Exists(pFileName);
-     if (found) {
-         return;
-     }
-     throw new ApplicationException("指定目录下的无该文件");
- }
- /// <summary>
- /// Extracts the text content of a Word document through Word COM automation.
- /// </summary>
- /// <param name="docFileName">Path of the document to read.</param>
- /// <returns>The text of the document's Content range.</returns>
- /// <exception cref="ApplicationException">Thrown by IsExists when the file does not exist.</exception>
- public string Doc2Text(string docFileName) {
-     IsExists(docFileName);
-     // Start a Word automation instance (COM).
-     Word.ApplicationClass wordApp = new Word.ApplicationClass();
-     Word.Document doc = null;
-     object fileobj = docFileName;
-     // Placeholder passed for every optional COM argument.
-     object nullobj = System.Reflection.Missing.Value;
-     try {
-         // The number of Open parameters differs between interop versions;
-         // everything except the file name can normally be left as Missing.
-         doc = wordApp.Documents.Open(ref fileobj, ref nullobj, ref nullobj,
-             ref nullobj, ref nullobj, ref nullobj,
-             ref nullobj, ref nullobj, ref nullobj,
-             ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
-         return doc.Content.Text;
-     }
-     finally {
-         // Always shut Word down, even when opening or reading fails; otherwise
-         // the instance started above is never told to quit.
-         try {
-             if (doc != null) {
-                 doc.Close(ref nullobj, ref nullobj, ref nullobj);
-             }
-         }
-         finally {
-             wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
-             // Kept from the original; presumably forces release of the COM wrappers.
-             GC.Collect();
-         }
-     }
- }
- /// <summary>
- /// Extracts the cell values of every worksheet in an Excel workbook through
- /// Excel COM automation. Non-empty cells are appended in row-major order,
- /// each followed by a "|" separator.
- /// </summary>
- /// <param name="xlsFileName">Path of the workbook to read.</param>
- /// <returns>The concatenated cell values; empty when no cell holds a value.</returns>
- /// <exception cref="ApplicationException">Thrown by IsExists when the file does not exist.</exception>
- public string Xls2Text(string xlsFileName) {
-     IsExists(xlsFileName);
-     Excel.Application xlsApp = new Excel.ApplicationClass();
-     Excel.Workbook excel = null;
-     // Placeholder passed for every optional COM argument.
-     object nullobj = System.Reflection.Missing.Value;
-     StringBuilder builder = new StringBuilder();
-     try {
-         // Opened inside the try so that a failed Open still reaches Quit below.
-         excel = xlsApp.Workbooks.Open(xlsFileName, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj, nullobj, nullobj,
-             nullobj);
-         int sheetCount = excel.Worksheets.Count;
-         for (int k = 1; k <= sheetCount; k++) {
-             Excel.Worksheet ews = (Excel.Worksheet)excel.Worksheets[k];
-             // Fetch the used range once per sheet; every Value2 access is a COM
-             // call that copies the whole range.
-             object used = ews.UsedRange.Value2;
-             if (used == null) {
-                 continue;
-             }
-             object[,] cells = used as object[,];
-             if (cells == null) {
-                 // A used range of exactly one cell yields a scalar, not an array.
-                 builder.Append(used).Append("|");
-                 continue;
-             }
-             // Iterate by the array's own bounds (arrays returned by Excel are 1-based).
-             for (int i = cells.GetLowerBound(0); i <= cells.GetUpperBound(0); i++) {
-                 for (int j = cells.GetLowerBound(1); j <= cells.GetUpperBound(1); j++) {
-                     object cell = cells[i, j];
-                     if (cell != null) {
-                         builder.Append(cell).Append("|");
-                     }
-                 }
-             }
-         }
-     }
-     finally {
-         // Exceptions propagate unchanged (no "throw ex", which would reset the stack trace).
-         try {
-             if (excel != null) {
-                 excel.Close(nullobj, nullobj, nullobj);
-             }
-         }
-         finally {
-             xlsApp.Quit();
-             // Kept from the original; presumably forces release of the COM wrappers.
-             GC.Collect();
-         }
-     }
-     return builder.ToString();
- }
- /// <summary>
- /// Extracts the text of every shape on every slide of a PowerPoint
- /// presentation through PowerPoint COM automation.
- /// </summary>
- /// <param name="pptFileName">Path of the presentation to read.</param>
- /// <returns>The shape texts concatenated in slide/shape order, without separators.</returns>
- /// <exception cref="ApplicationException">Thrown by IsExists when the file does not exist.</exception>
- public string Ppt2Text(string pptFileName) {
-     IsExists(pptFileName);
-     PowerPoint.Application pptApp = new PowerPoint.ApplicationClass();
-     PowerPoint.Presentation ppt = null;
-     StringBuilder builder = new StringBuilder();
-     try {
-         // Arguments after the file name: ReadOnly = true, Untitled = false, WithWindow = false.
-         // Opened inside the try so that a failed Open still reaches Quit below.
-         ppt = pptApp.Presentations.Open(pptFileName,
-             Microsoft.Office.Core.MsoTriState.msoTrue,
-             Microsoft.Office.Core.MsoTriState.msoFalse,
-             Microsoft.Office.Core.MsoTriState.msoFalse);
-         foreach (PowerPoint.Slide slide in ppt.Slides) {
-             foreach (PowerPoint.Shape shape in slide.Shapes) {
-                 // Not every shape owns a text frame; skip those before touching TextFrame.
-                 if (shape.HasTextFrame != Microsoft.Office.Core.MsoTriState.msoTrue) {
-                     continue;
-                 }
-                 if (shape.TextFrame.HasText == Microsoft.Office.Core.MsoTriState.msoTrue) {
-                     builder.Append(shape.TextFrame.TextRange.Text);
-                 }
-             }
-         }
-     }
-     finally {
-         // Exceptions propagate unchanged (no "throw ex", which would reset the stack trace).
-         try {
-             if (ppt != null) {
-                 ppt.Close();
-             }
-         }
-         finally {
-             pptApp.Quit();
-             // Kept from the original; presumably forces release of the COM wrappers.
-             GC.Collect();
-         }
-     }
-     return builder.ToString();
- }
- #endregion
此内容写得比较详细,故转贴于此。我自己使用时已经改写成 VB.NET
格式,并做了一些优化,相关文章里有代码。