Fórum Como fazer um web crawler #569765
09/04/2009
0
Sink0
Curtir tópico
+ 0Posts
09/04/2009
Hudson Afonso
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.PostMethod;
public class Combo {
/** URL the search form parameters are posted to. */
private static final String SEARCH_URL = "http://www.essentialmagic.com//combos/search.asp";

/** URL whose response body is scanned for combo labels and card links. */
private static final String RESULTS_URL = "http://www.essentialmagic.com/combos/SearchResults.asp";

/**
 * Matches a whole "Label" div element, e.g.
 * {@code <div class="Label" style="font-size: 11pt; font-style: italic;">Grim Harvest</div>}.
 */
private static final Pattern COMBO_ELEMENT = Pattern.compile("(<[^>]*?div[^>]*?(?:Label)[^>]*>)((?:.*?(?:<[ \r\t]*div[^>]*>?.*?(?:<.*?/.*?div.*?>)?)*)*)(<[^>]*?/[^>]*?div[^>]*?>)");

/** Matches only the opening tag of a "Label" div. */
private static final Pattern COMBO_OPEN_TAG = Pattern.compile("<[^>]*?div[^>]*?(?:Label)[^>]*>");

/** Matches a whole anchor element whose opening tag mentions "hidePopup". */
private static final Pattern CARD_ELEMENT = Pattern.compile("(<[^>]*?a[^>]*?(?:hidePopup)[^>]*>)((?:.*?(?:<[ \r\t]*a[^>]*>?.*?(?:<.*?/.*?a.*?>)?)*)*)(<[^>]*?/[^>]*?a[^>]*?>)");

/** Matches only the opening tag of a "hidePopup" anchor. */
private static final Pattern CARD_OPEN_TAG = Pattern.compile("<[^>]*?a[^>]*?(?:hidePopup)[^>]*>");

/**
 * Posts a fixed combo search for the card "Torture", fetches the results
 * page and prints every combo label followed by the text of the card links
 * that appear after it in the page.
 *
 * <p>Any exception is caught here and its stack trace is printed, so the
 * method never propagates a failure to the caller.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    try {
        String card = "Torture";
        String page = fetchResultsPage(card);
        if (page == null) {
            // getResponseBodyAsString() is documented to return null when no body is available.
            System.err.println("No response body received from " + RESULTS_URL);
            return;
        }

        // Collect each combo label together with its offset in the page; the two
        // lists are kept parallel (same index = same combo).
        List<String> comboNames = new ArrayList<String>();
        List<Integer> comboOffsets = new ArrayList<Integer>();
        Matcher comboMatcher = COMBO_ELEMENT.matcher(page);
        while (comboMatcher.find()) {
            String name = innerText(comboMatcher.group(), COMBO_OPEN_TAG, "</div>");
            if (name == null) {
                continue; // element without a literal "</div>": skip it instead of aborting the run
            }
            comboNames.add(name);
            comboOffsets.add(Integer.valueOf(comboMatcher.start()));
        }

        int combo = 0;
        Matcher cardMatcher = CARD_ELEMENT.matcher(page);
        while (cardMatcher.find()) {
            // Print the header of every combo that starts before this card. The first
            // header is printed unconditionally with the first card. Looping (rather
            // than advancing a single combo per card) keeps cards under the right
            // header when an earlier combo has no card links of its own.
            while (combo < comboOffsets.size()
                    && (combo == 0 || cardMatcher.start() > comboOffsets.get(combo).intValue())) {
                System.out.println("\n\n>>" + comboNames.get(combo));
                combo++;
            }
            String cardName = innerText(cardMatcher.group(), CARD_OPEN_TAG, "</a>");
            if (cardName != null) {
                System.out.println(cardName);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Submits the search form for {@code card} and then requests the results page
 * with the same {@link HttpClient} instance, releasing each connection when done.
 *
 * @param card value sent as the {@code cnbtxtCard} form field
 * @return the body of the results page, or {@code null} if none was available
 * @throws java.io.IOException if either request fails
 */
private static String fetchResultsPage(String card) throws java.io.IOException {
    // NOTE(review): the second request sends no parameters, so it presumably relies on
    // state (e.g. cookies) kept by this client after the first request - confirm.
    HttpClient client = new HttpClient();

    PostMethod search = new PostMethod(SEARCH_URL);
    try {
        List<NameValuePair> params = buildSearchParams(card);
        search.setRequestBody(params.toArray(new NameValuePair[params.size()]));
        client.executeMethod(search);
    } finally {
        search.releaseConnection();
    }

    PostMethod results = new PostMethod(RESULTS_URL);
    try {
        client.executeMethod(results);
        return results.getResponseBodyAsString();
    } finally {
        results.releaseConnection();
    }
}

/**
 * Builds the form fields of the search request, in the order they are sent.
 *
 * @param card value of the {@code cnbtxtCard} field
 * @return the list of name/value pairs for the request body
 */
private static List<NameValuePair> buildSearchParams(String card) {
    List<NameValuePair> params = new ArrayList<NameValuePair>();
    params.add(new NameValuePair("txtName", ""));
    params.add(new NameValuePair("txtCreator", ""));
    params.add(new NameValuePair("cnbtxtCard", card));
    params.add(new NameValuePair("cnbhdnCard", "5"));
    params.add(new NameValuePair("selDateCompare", "BETWEEN"));
    params.add(new NameValuePair("txtDate", ""));
    params.add(new NameValuePair("selFormat", "3"));
    params.add(new NameValuePair("selThemes", "0"));
    params.add(new NameValuePair("selRatingCompare", ">="));
    params.add(new NameValuePair("selRatings", "0"));
    params.add(new NameValuePair("selNumRatingsCompare", ">="));
    params.add(new NameValuePair("selNumRatings", "-1"));
    params.add(new NameValuePair("selRarities", "0"));
    params.add(new NameValuePair("selInvalid", "0"));
    params.add(new NameValuePair("chkColor0", "on"));
    params.add(new NameValuePair("chkColor1", "on"));
    params.add(new NameValuePair("chkColor2", "on"));
    params.add(new NameValuePair("chkColor3", "on"));
    params.add(new NameValuePair("chkColor4", "on"));
    params.add(new NameValuePair("chkColor5", "on"));
    params.add(new NameValuePair("selColorCompare", "OR"));
    params.add(new NameValuePair("btnSearch", "Begin+Search"));
    return params;
}

/**
 * Returns the text between the end of the first {@code openTag} match and the
 * next occurrence of {@code closeTag} in {@code element}.
 *
 * @param element  markup of a single matched element
 * @param openTag  pattern for the element's opening tag
 * @param closeTag literal closing tag to look for
 * @return the enclosed text, or {@code null} if either tag is not found
 */
private static String innerText(String element, Pattern openTag, String closeTag) {
    Matcher open = openTag.matcher(element);
    if (!open.find()) {
        return null;
    }
    int start = open.end();
    int end = element.indexOf(closeTag, start);
    if (end < 0) {
        return null;
    }
    return element.substring(start, end);
}
}
Gostei + 0
09/04/2009
Sink0
Gostei + 0
09/04/2009
Ricardo Staroski
Gostei + 0
09/04/2009
Hudson Afonso
Gostei + 0
09/04/2009
Sink0
Gostei + 0
09/04/2009
Sink0
Gostei + 0
09/04/2009
Mauricio Nunes