Wednesday, September 2, 2009

JAVA Regex

Regex to get all links from an HTML source
// Prints every anchor element found in pageContent, followed by the value
// of its href attribute when one is present.

// Matches one whole anchor element: the opening <a ...> tag through the
// nearest closing </a>. \b keeps tags such as <abbr> or <area> from matching.
String patternString = "<a\\b[^>]*>.*?</a>";
// Pattern flags form a bit mask and must be combined with the bitwise OR operator.
Pattern pattern = Pattern.compile(
        patternString,
        Pattern.MULTILINE | Pattern.CANON_EQ | Pattern.DOTALL
                | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

// Captures the quoted href value as group 1. Compiled once, outside the loop.
String hRefPattern = "href=['\"](.*?)['\"]";
Pattern hRefRegex = Pattern.compile(hRefPattern, Pattern.CASE_INSENSITIVE);

Matcher matcher = pattern.matcher(pageContent);
while (matcher.find()) {
    String extractedUrl = matcher.group();
    System.out.println(extractedUrl);

    // extracts the href link from the anchor that was just matched
    Matcher m = hRefRegex.matcher(extractedUrl);
    if (m.find()) {
        String link = m.group(1);
        System.out.println(link);
    }
}

-----------------------------
Regex to get all RSS/Atom links from an HTML source
// Prints the href of every <link> tag in pageContent that advertises an RSS feed.
//
// [^>]* keeps each match inside a single tag (and lets it span line breaks
// within that tag). The pattern ends at the closing quote of the href value,
// so both <link ...> and self-closing <link ... /> tags are accepted.
// Limitation: the rss+xml type attribute must appear before href in the tag.
String patternRSS = "<link\\b[^>]*?rss\\+xml[^>]*?href\\s*=\\s*['\"](.*?)['\"]";
// Pattern flags form a bit mask and must be combined with the bitwise OR operator.
Pattern pattern = Pattern.compile(
        patternRSS,
        Pattern.MULTILINE | Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcherRSS = pattern.matcher(pageContent);
while (matcherRSS.find()) {
    // Group 1 is the href value without its surrounding quotes.
    String rssUrl = matcherRSS.group(1);
    System.out.println(rssUrl);
}
// likewise for Atom feeds: same shape, only the MIME subtype differs
String patternATOM = "<link\\b[^>]*?atom\\+xml[^>]*?href\\s*=\\s*['\"](.*?)['\"]";

------------------------------
Get links from an RSS/Atom source
// Prints the text content of every <link>...</link> element in rssXML.
// Only links written as element content are matched; a link given as an
// attribute (<link href="..."/>) is not matched by this pattern.
String patternRSSLinks = "<link>(.*?)</link>";
// Pattern flags form a bit mask and must be combined with the bitwise OR operator.
Pattern pattern = Pattern.compile(
        patternRSSLinks,
        Pattern.MULTILINE | Pattern.CANON_EQ | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcherRSSLinks = pattern.matcher(rssXML);
while (matcherRSSLinks.find()) {
    // Group 1 is the text between the opening and closing tags.
    String link = matcherRSSLinks.group(1);
    System.out.println(link);
}

---------------------------------
Detecting JavaScript in an HTML link and getting its value
// Looks for a JavaScript call inside href and captures the first quoted
// argument of that call into javascriptHref. The three patterns are tried
// from most to least specific, and the first one that matches wins.
// The captured value is stored as-is; resolving it to an absolute URL is
// not done here.
// NOTE(review): assumes javascriptPresenceDetected is false on entry — confirm
// against the surrounding code.

// 1. check for presence of the opendocument, openwindow and window.open methods
patternString = "(?:opendocument|openwindow|window\\.open)\\s*\\(['\"](.*?)['\"]";
// Pattern flags form a bit mask and must be combined with the bitwise OR operator.
Pattern pattern = Pattern.compile(
        patternString,
        Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
Matcher matcher = pattern.matcher(href);
if (matcher.find()) {
    javascriptPresenceDetected = true;
    javascriptHref = matcher.group(1);
}

// 2. if not caught by 1: any javascript: URL containing a call with a quoted argument
if (!javascriptPresenceDetected) {
    patternString = "(?:javascript:).*?\\(['\"](.*?)['\"]";
    pattern = Pattern.compile(
            patternString,
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    matcher = pattern.matcher(href);
    if (matcher.find()) {
        javascriptPresenceDetected = true;
        javascriptHref = matcher.group(1);
    }
}

// 3. if not caught by 1 & 2: an onclick handler containing a call with a quoted argument
if (!javascriptPresenceDetected) {
    patternString = "(?:onclick).*?\\(['\"](.*?)['\"]";
    pattern = Pattern.compile(
            patternString,
            Pattern.DOTALL | Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    matcher = pattern.matcher(href);
    if (matcher.find()) {
        javascriptPresenceDetected = true;
        javascriptHref = matcher.group(1);
    }
}

No comments:

Post a Comment