清除 Word 格式的 C# 代码
// Requires: using System; using System.IO; using System.Text.RegularExpressions;

/// <summary>
/// Command-line entry point. Reads the HTML file named by the first argument, strips
/// Word-specific markup with <c>CleanWordHtml</c>, encodes typographic punctuation with
/// <c>FixEntities</c>, and writes the result to a file whose name is the input file name
/// with its extension replaced by ".modified.htm".
/// </summary>
/// <param name="args">Command-line arguments; <c>args[0]</c> is the input file name or path.</param>
static void Main(string[] args)
{
    if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
    {
        Console.WriteLine("No filename provided.");
        return;
    }

    string filepath = args[0];

    // A bare file name (no directory part) is resolved against the current directory.
    if (Path.GetFileName(filepath) == args[0])
    {
        filepath = Path.Combine(Environment.CurrentDirectory, filepath);
    }

    // Check the resolved path and stop here; otherwise File.ReadAllText below would throw.
    if (!File.Exists(filepath))
    {
        Console.WriteLine("File doesn't exist.");
        return;
    }

    string html = File.ReadAllText(filepath);
    Console.WriteLine("input html is " + html.Length + " chars");

    html = CleanWordHtml(html);
    html = FixEntities(html);

    // Only the file name is kept here, so the output path is relative to the
    // current working directory rather than to the input file's directory.
    filepath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";
    File.WriteAllText(filepath, html);
    Console.WriteLine("cleaned html is " + html.Length + " chars");
}

/// <summary>
/// Removes markup that Microsoft Word adds when saving a document as HTML by applying a
/// fixed sequence of case-insensitive regular-expression deletions.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>
/// The cleaned HTML; a null or empty input is returned unchanged.
/// </returns>
static string CleanWordHtml(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // NOTE(review): the regex escapes (\w, \W, \s, \d, \[, +, &nbsp;, \n\r) were missing
    // from the pasted source and have been reconstructed from the surrounding comments;
    // confirm against the original listing. Patterns are applied in this order.
    string[] patterns =
    {
        // Get rid of unnecessary tag spans (comments and title).
        @"<!--(\w|\W)+?-->",
        @"<title>(\w|\W)+?</title>",

        // Get rid of classes and styles.
        @"\s?class=\w+",
        @"\s+style='[^']+'",

        // Get rid of unnecessary tags.
        @"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>",

        // Get rid of empty paragraph tags (elements containing only &nbsp;).
        @"(<[^>]+>)+&nbsp;(</\w+>)+",

        // Remove the v: attribute attached to <img> tags.
        @"\s+v:\w+=""[^""]+""",

        // Remove extra lines. On CRLF text "\n\r" straddles adjacent line breaks, so a run
        // of three or more breaks loses its middle part and one break remains.
        // NOTE(review): reconstructed pattern - confirm.
        @"(\n\r){2,}"
    };

    foreach (string pattern in patterns)
    {
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
    }

    return html;
}

/// <summary>
/// Replaces curly double quotes and em dashes with their named HTML entities.
/// </summary>
/// <param name="html">The HTML text to process.</param>
/// <returns>
/// The text with U+201C, U+201D and U+2014 replaced by <c>&amp;ldquo;</c>,
/// <c>&amp;rdquo;</c> and <c>&amp;mdash;</c>; a null or empty input is returned unchanged.
/// </returns>
static string FixEntities(string html)
{
    if (String.IsNullOrEmpty(html))
    {
        return html;
    }

    // Keys are written as \u escapes so the mapping does not depend on how this source
    // file is encoded (the pasted version had degraded the em dash to "?").
    string[,] entities =
    {
        { "\u201C", "&ldquo;" },
        { "\u201D", "&rdquo;" },
        { "\u2014", "&mdash;" }
    };

    for (int i = 0; i < entities.GetLength(0); i++)
    {
        html = html.Replace(entities[i, 0], entities[i, 1]);
    }

    return html;
}