清除word格式的C#代码

13年前
/// <summary>
/// Entry point: reads the HTML file named by the first command-line argument,
/// strips Word-specific markup, converts typographic characters to HTML
/// entities, and writes the result to "&lt;name&gt;.modified.htm".
/// </summary>
/// <param name="args">Command-line arguments; args[0] is the input file path.</param>
static void Main(string[] args)
{
    if (args.Length == 0 || String.IsNullOrEmpty(args[0]))
    {
        Console.WriteLine("No filename provided.");
        return;
    }

    // A bare file name (no directory part) is resolved against the current directory.
    string filepath = args[0];
    if (Path.GetFileName(filepath) == args[0])
    {
        filepath = Path.Combine(Environment.CurrentDirectory, filepath);
    }

    if (!File.Exists(filepath))
    {
        Console.WriteLine("File doesn't exist.");
        // Stop here; otherwise File.ReadAllText below throws on the missing file.
        return;
    }

    string html = File.ReadAllText(filepath);
    Console.WriteLine("input html is " + html.Length + " chars");

    html = CleanWordHtml(html);
    html = FixEntities(html);

    // Only the file name is kept, so the output is written relative to the
    // current working directory rather than next to the input file.
    filepath = Path.GetFileNameWithoutExtension(filepath) + ".modified.htm";

    File.WriteAllText(filepath, html);
    Console.WriteLine("cleaned html is " + html.Length + " chars");
}

/// <summary>
/// Removes markup that Microsoft Word adds when saving a document as HTML.
/// Each pattern is applied in order, case-insensitively, and every match is
/// replaced with the empty string.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>The HTML with all pattern matches removed.</returns>
static string CleanWordHtml(string html)
{
    string[] patterns =
    {
        // get rid of unnecessary tag spans (comments and title)
        @"<!--(\w|\W)+?-->",
        @"<title>(\w|\W)+?</title>",
        // get rid of classes and styles
        @"\s?class=\w+",
        @"\s+style='[^']+'",
        // get rid of unnecessary tags
        @"<(meta|link|/?o:|/?style|/?div|/?st\d|/?head|/?html|body|/?body|/?span|!\[)[^>]*?>",
        // get rid of empty paragraph tags (tags wrapping a lone &nbsp;)
        @"(<[^>]+>)+&nbsp;(</\w+>)+",
        // remove bizarre v: element attached to <img> tag
        @"\s+v:\w+=""[^""]+""",
        // remove extra lines: matching LF+CR pairs (not CR+LF) leaves the outer
        // CR and LF of a run of CRLFs behind, so blank lines collapse to a
        // single line break instead of joining the surrounding lines
        @"(\n\r){2,}"
    };

    foreach (string pattern in patterns)
    {
        html = Regex.Replace(html, pattern, "", RegexOptions.IgnoreCase);
    }
    return html;
}

/// <summary>
/// Replaces typographic characters (curly double quotes and dashes) with the
/// corresponding named HTML entities.
/// </summary>
/// <param name="html">The HTML text to process.</param>
/// <returns>The HTML with the mapped characters replaced by entities.</returns>
static string FixEntities(string html)
{
    // Characters are written as escapes so the mapping survives re-encoding
    // of this source file.
    // NOTE(review): the original dash key was lost to an encoding error (it
    // appeared as "?"); both dash characters are mapped here - confirm.
    string[,] entities =
    {
        { "\u201C", "&ldquo;" }, // left double quotation mark
        { "\u201D", "&rdquo;" }, // right double quotation mark
        { "\u2014", "&mdash;" }, // em dash
        { "\u2013", "&ndash;" }  // en dash
    };

    for (int i = 0; i < entities.GetLength(0); i++)
    {
        html = html.Replace(entities[i, 0], entities[i, 1]);
    }
    return html;
}