C#数据采集中可以用到的几个方法

本文由“前端小玖”整理，介绍数据采集过程中可能会用到的一些方法。因为我采集的数据比较简单，所以没有用到框架。比较有名的两个框架是 HtmlAgilityPack 和 Jumony，感兴趣的可以研究一下。当然，火车头采集工具也很方便，不过需要付费。下面是整理的代码：

/// <summary>
/// Regular-expression helpers for pulling text out of HTML.
/// </summary>
public class HtmlRegex
{
    /// <summary>
    /// Pattern that matches any HTML tag (opening or closing).
    /// The second alternative is already covered by the first; it is kept
    /// unchanged so the matching behaviour stays exactly as before.
    /// </summary>
    const string HTMLALLTAG = @"<[^>]+>|</[^>]+>";

    /// <summary>
    /// Removes every HTML tag from the given text.
    /// </summary>
    /// <param name="content">Original HTML source.</param>
    /// <returns>The text with all tag matches replaced by an empty string.</returns>
    public static string RemoveAllHtml(string content)
    {
        return Regex.Replace(content, HTMLALLTAG, "");
    }

    /// <summary>
    /// Returns the first match of a regular expression in the content.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Original HTML source.</param>
    /// <param name="hashtml">
    /// When true the match is returned as-is; when false HTML tags are stripped from it.
    /// </param>
    /// <returns>The first match, or an empty string when nothing matches.</returns>
    public static string GetStrByRegex(string regStr, string content, bool hashtml = true)
    {
        Match match = new Regex(regStr).Match(content);
        if (!match.Success)
        {
            return string.Empty;
        }

        return hashtml ? match.Value : RemoveAllHtml(match.Value);
    }

    /// <summary>
    /// Returns the first fragment that begins with <paramref name="start"/> and ends
    /// with the nearest following <paramref name="end"/> (both delimiters included).
    /// </summary>
    /// <param name="start">Start delimiter; inserted into the pattern as a regex fragment.</param>
    /// <param name="end">End delimiter; inserted into the pattern as a regex fragment.</param>
    /// <param name="content">Original HTML source.</param>
    /// <param name="hasHtml">
    /// When true the match is returned as-is; when false HTML tags are stripped from it.
    /// </param>
    /// <returns>The first matching fragment, or an empty string when nothing matches.</returns>
    public static string GetStrByRegex(string start, string end, string content, bool hasHtml = true)
    {
        return GetStrByRegex(BuildRangePattern(start, end), content, hasHtml);
    }

    /// <summary>
    /// Returns every match of a regular expression in the content.
    /// </summary>
    /// <param name="regStr">Regular expression pattern.</param>
    /// <param name="content">Original HTML source.</param>
    /// <returns>
    /// The matched strings in order of appearance, or null when there is no match
    /// or when the pattern/content is rejected by the regex engine.
    /// </returns>
    public static List<string> GetStrListByRegex(string regStr, string content)
    {
        try
        {
            MatchCollection matches = new Regex(regStr).Matches(content);
            if (matches.Count == 0)
            {
                // Existing contract: "nothing found" is reported as null, not as an empty list.
                return null;
            }

            List<string> results = new List<string>(matches.Count);
            foreach (Match match in matches)
            {
                results.Add(match.Value);
            }

            return results;
        }
        catch (System.ArgumentException)
        {
            // Invalid pattern or null argument: keep the best-effort behaviour and report null.
            return null;
        }
        catch (RegexMatchTimeoutException)
        {
            // Only possible when a default match timeout is configured; also reported as null.
            return null;
        }
    }

    /// <summary>
    /// Returns every fragment that begins with <paramref name="start"/> and ends with
    /// the nearest following <paramref name="end"/> (both delimiters included).
    /// </summary>
    /// <param name="start">Start delimiter; inserted into the pattern as a regex fragment.</param>
    /// <param name="end">End delimiter; inserted into the pattern as a regex fragment.</param>
    /// <param name="content">Original HTML source.</param>
    /// <returns>
    /// The matched fragments in order of appearance, or null when there is no match
    /// or when the resulting pattern/content is rejected by the regex engine.
    /// </returns>
    public static List<string> GetStrListByRegex(string start, string end, string content)
    {
        return GetStrListByRegex(BuildRangePattern(start, end), content);
    }

    /// <summary>
    /// Builds the lazy "start ... end" pattern shared by the delimiter-based overloads.
    /// The inline options (?is) make the match case-insensitive and let '.' match newlines.
    /// The delimiters are not escaped, so regex metacharacters in them keep their meaning.
    /// </summary>
    private static string BuildRangePattern(string start, string end)
    {
        return @"(?is)(" + start + ").*?(" + end + ")";
    }
}

本文作者：前端小玖

查看更多关于C#数据采集中可以用到的几个方法的详细内容...

声明：本文来自网络，不代表【好得很程序员自学网】立场，转载请注明出处：http://haodehen.cn/did162541

更新时间：2022-12-31 阅读：97次