Webcrawler java Hoverkraft

He estado trasteando con una forma de simular un navegador en Java. Hasta ahora he usado JMeter, que es tremendamente potente y configurable, y para pruebas de carga es imprescindible. No obstante, hay dos detalles que no me convencen: primero, a veces uno quiere algo programático en lugar de declarativo; y segundo, JMeter es durillo de entender y configurar. Además, no siempre es necesario tener métricas exóticas o peticiones de Ajax; a veces solo queremos acceder a algún servicio web o analizar una web para bajar ficheros o automatizar tareas.

Aunque hay un montón de soluciones disponibles, me he propuesto hacer un pequeño simulador de navegador (un webcrawler) en Java que permita implementar tareas de forma sencilla. A la criatura le he puesto por nombre Hoverkraft. Dejo por aquí el código fuente.

package net.krusher.hoverkraft;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.net.HttpCookie;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
/**
 * Hoverkraft - Das Web Boot.
 *
 * <p>A small programmatic browser simulator built on {@link HttpURLConnection}.
 * Typical usage: call {@link #go(String, Method)} to choose a destination,
 * optionally {@link #setPostVars(Map)} / {@link #setUploads(Map)}, then
 * {@link #execute()} and read the result with {@link #getContent()},
 * {@link #getXml()} or {@link #getCode()}.
 *
 * <p>Cookies received from any response are remembered by name and sent back
 * with every later request, regardless of host or path. The referer of each
 * request is the URL of the last successfully executed one.
 *
 * <p>NOTE: the stored {@link HttpCookie} objects are not serializable, so an
 * instance can only be serialized while its cookie jar is empty.
 *
 * @author Axelei
 */
public class Hoverkraft implements Serializable {

    private static final long serialVersionUID = -4846381367781986634L;

    /** Value sent in the {@code user-agent} request header. */
    public static final String USER_AGENT = " Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36";
    /** Value sent in the {@code accept} request header. */
    public static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    /** Value sent in the {@code accept-language} request header. */
    public static final String ACCEPT_LANGUAGE = "es,en-US;q=0.8,en;q=0.6";
    /** Maximum number of redirects followed by a single {@link #execute()}. */
    public static final int MAX_TRIES = 5;

    private static final String LINE_FEED = "\r\n";
    /** Charset used for request bodies and as fallback when decoding responses. */
    private static final String DEFAULT_CHARSET = "UTF-8";
    // Redirect codes that HttpURLConnection does not expose as constants.
    private static final int HTTP_TEMPORARY_REDIRECT = 307;
    private static final int HTTP_PERMANENT_REDIRECT = 308;

    /** HTTP methods supported by the crawler. */
    public enum Method {
        GET, POST
    }

    private URL url;
    // A live connection cannot be serialized; it is rebuilt on every execute().
    private transient HttpURLConnection connection;
    private String content;
    private int code = -1;
    private Method method;
    private String referer;
    private Map<String, String> postVars = new HashMap<String, String>();
    private Map<String, HttpCookie> cookies = new HashMap<String, HttpCookie>();
    private Map<String, File> uploads = new HashMap<String, File>();

    public Hoverkraft() {
        super();
    }

    /**
     * Parses the content of the last execution with jsoup.
     *
     * @return the parsed document
     * @throws IllegalStateException if no content is available yet
     */
    public Document getXml() {
        if (content == null) {
            throw new IllegalStateException("No content available; call execute() first");
        }
        return Jsoup.parse(content);
    }

    /**
     * Set sail to a destination. Nothing is sent until {@link #execute()}.
     *
     * @param url absolute URL to request
     * @param method HTTP method to use; {@code null} is treated as {@link Method#GET}
     * @throws MalformedURLException if {@code url} cannot be parsed
     */
    public void go(String url, Method method) throws MalformedURLException {
        this.url = new URL(url);
        this.method = (method != null) ? method : Method.GET;
    }

    /**
     * Set sail to a destination using {@link Method#GET}.
     *
     * @param url absolute URL to request
     * @throws MalformedURLException if {@code url} cannot be parsed
     */
    public void go(String url) throws MalformedURLException {
        go(url, Method.GET);
    }

    /** Disconnects the last connection, if there is one. */
    public void disconnect() {
        if (connection != null) {
            connection.disconnect();
        }
    }

    /**
     * Sets the form fields sent with the next POST request.
     *
     * @param vars field names mapped to values; {@code null} means no fields.
     *     The map itself is never modified by this class.
     */
    public void setPostVars(Map<String, String> vars) {
        this.postVars = (vars != null) ? vars : new HashMap<String, String>();
    }

    /**
     * Sets the files sent with the next POST request. When at least one upload
     * is present the request body is encoded as {@code multipart/form-data}.
     *
     * @param uploads field names mapped to files; {@code null} means no uploads.
     *     The map itself is never modified by this class.
     */
    public void setUploads(Map<String, File> uploads) {
        this.uploads = (uploads != null) ? uploads : new HashMap<String, File>();
    }

    /**
     * Executes the web petition, following up to {@link #MAX_TRIES} redirects.
     *
     * <p>Redirects are followed here rather than by {@link HttpURLConnection}
     * so that cookies set by intermediate responses are kept. A 301, 302 or 303
     * answer is followed with GET; 307 and 308 repeat the request with the same
     * method and body. Pending POST variables and uploads are discarded once a
     * POST has been attempted, whether or not it succeeded.
     *
     * @throws IOException if the request fails or the response body cannot be read
     * @throws IllegalStateException if no destination was set with {@code go}
     */
    public void execute() throws IOException {
        if (url == null) {
            throw new IllegalStateException("No destination set; call go() before execute()");
        }
        // Never expose the previous page as the result of a failed request.
        content = null;
        boolean bodyConsumed = false;
        try {
            int redirects = 0;
            while (true) {
                connection = (HttpURLConnection) url.openConnection();
                connection.setInstanceFollowRedirects(false);
                bodyConsumed |= (method == Method.POST);
                setProperties(connection);
                connection.connect();
                code = connection.getResponseCode();
                storeCookies(connection);

                String location = connection.getHeaderField("Location");
                if (!isRedirect(code) || location == null || redirects >= MAX_TRIES) {
                    break;
                }
                redirects++;
                if (code != HTTP_TEMPORARY_REDIRECT && code != HTTP_PERMANENT_REDIRECT) {
                    // Same behavior as browsers: the follow-up request is a plain GET.
                    method = Method.GET;
                }
                // Location may be relative, so resolve it against the current URL.
                url = new URL(url, location);
            }
            content = stream2string(connection.getInputStream(), charsetOf(connection.getContentType()));
            referer = url.toString();
        } finally {
            if (bodyConsumed) {
                // Replace instead of clear(): the maps may belong to the caller.
                postVars = new HashMap<String, String>();
                uploads = new HashMap<String, File>();
            }
        }
    }

    /** Tells whether the given status code is a redirect this class follows. */
    private static boolean isRedirect(int status) {
        return status == HttpURLConnection.HTTP_MOVED_TEMP
                || status == HttpURLConnection.HTTP_MOVED_PERM
                || status == HttpURLConnection.HTTP_SEE_OTHER
                || status == HTTP_TEMPORARY_REDIRECT
                || status == HTTP_PERMANENT_REDIRECT;
    }

    /** Remembers every cookie set by the given response, replacing same-named ones. */
    private void storeCookies(HttpURLConnection response) {
        for (Entry<String, List<String>> header : response.getHeaderFields().entrySet()) {
            // The status line is stored under a null key; header names are case-insensitive.
            if (!"Set-Cookie".equalsIgnoreCase(header.getKey())) {
                continue;
            }
            for (String rawCookie : header.getValue()) {
                try {
                    for (HttpCookie parsed : HttpCookie.parse(rawCookie)) {
                        cookies.put(parsed.getName(), parsed);
                    }
                } catch (IllegalArgumentException ignored) {
                    // A malformed cookie is skipped instead of aborting the whole request.
                }
            }
        }
    }

    /**
     * Prepares the request: headers, method, cookies and, for POST, the body.
     * Writing the body opens the connection.
     */
    private void setProperties(HttpURLConnection connection) throws IOException {
        // Headers
        connection.setRequestProperty("user-agent", USER_AGENT);
        connection.setRequestProperty("accept", ACCEPT);
        connection.setRequestProperty("accept-language", ACCEPT_LANGUAGE);
        if (referer != null) {
            connection.setRequestProperty("referer", referer);
        }
        connection.setRequestMethod(method.toString());

        // Cookies: all of them go in a single header; setting the property
        // once per cookie would keep only the last one.
        if (!cookies.isEmpty()) {
            StringBuilder cookieHeader = new StringBuilder();
            for (HttpCookie cookie : cookies.values()) {
                if (cookieHeader.length() > 0) {
                    cookieHeader.append("; ");
                }
                cookieHeader.append(cookie.getName()).append('=').append(cookie.getValue());
            }
            connection.setRequestProperty("Cookie", cookieHeader.toString());
        }

        connection.setDoOutput(false);
        if (method != Method.POST) {
            return;
        }
        // POST body: multipart when there are files, url-encoded otherwise.
        if (!uploads.isEmpty()) {
            writeMultipartBody(connection);
        } else if (!postVars.isEmpty()) {
            writeUrlEncodedBody(connection);
        }
    }

    /** Sends the POST variables as {@code application/x-www-form-urlencoded}. */
    private void writeUrlEncodedBody(HttpURLConnection connection) throws IOException {
        StringBuilder urlParameters = new StringBuilder();
        for (Entry<String, String> field : postVars.entrySet()) {
            if (urlParameters.length() > 0) {
                urlParameters.append('&');
            }
            urlParameters.append(URLEncoder.encode(field.getKey(), DEFAULT_CHARSET))
                    .append('=')
                    .append(URLEncoder.encode(nullToEmpty(field.getValue()), DEFAULT_CHARSET));
        }
        byte[] body = urlParameters.toString().getBytes(DEFAULT_CHARSET);

        connection.setDoOutput(true);
        connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
        connection.setRequestProperty("charset", "utf-8");
        connection.setRequestProperty("Content-Length", Integer.toString(body.length));
        OutputStream out = connection.getOutputStream();
        try {
            out.write(body);
            out.flush();
        } finally {
            out.close();
        }
    }

    /** Sends the POST variables and the uploads as {@code multipart/form-data}. */
    private void writeMultipartBody(HttpURLConnection connection) throws IOException {
        String boundary = "===" + System.currentTimeMillis() + "===";
        connection.setUseCaches(false);
        connection.setDoOutput(true);
        connection.setDoInput(true);
        connection.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary);
        connection.setRequestProperty("charset", "UTF-8");

        OutputStream outputStream = connection.getOutputStream();
        try {
            // A plain writer (not PrintWriter) so that write failures surface as IOException.
            OutputStreamWriter writer = new OutputStreamWriter(outputStream, DEFAULT_CHARSET);
            for (Entry<String, String> field : postVars.entrySet()) {
                writer.write("--" + boundary + LINE_FEED);
                writer.write("Content-Disposition: form-data; name=\"" + field.getKey() + "\"" + LINE_FEED);
                writer.write("Content-Type: text/plain; charset=UTF-8" + LINE_FEED);
                writer.write(LINE_FEED);
                writer.write(nullToEmpty(field.getValue()) + LINE_FEED);
            }
            for (Entry<String, File> upload : uploads.entrySet()) {
                File file = upload.getValue();
                String fileName = file.getName();
                String contentType = URLConnection.guessContentTypeFromName(fileName);
                if (contentType == null) {
                    // Unknown extension: declare generic binary data.
                    contentType = "application/octet-stream";
                }
                writer.write("--" + boundary + LINE_FEED);
                writer.write("Content-Disposition: form-data; name=\"" + upload.getKey()
                        + "\"; filename=\"" + fileName + "\"" + LINE_FEED);
                writer.write("Content-Type: " + contentType + LINE_FEED);
                writer.write("Content-Transfer-Encoding: binary" + LINE_FEED);
                writer.write(LINE_FEED);
                // The part headers must reach the stream before the raw file bytes.
                writer.flush();
                copyFile(file, outputStream);
                writer.write(LINE_FEED);
            }
            // The line break that ends the last part is already written; another
            // one here would become part of the last file's data.
            writer.write("--" + boundary + "--" + LINE_FEED);
            writer.flush();
        } finally {
            outputStream.close();
        }
    }

    /** Copies the bytes of {@code file} to {@code out}, always closing the file. */
    private static void copyFile(File file, OutputStream out) throws IOException {
        FileInputStream in = new FileInputStream(file);
        try {
            byte[] buffer = new byte[4096];
            int bytesRead;
            while ((bytesRead = in.read(buffer)) != -1) {
                out.write(buffer, 0, bytesRead);
            }
            out.flush();
        } finally {
            in.close();
        }
    }

    /** Returns {@code value}, or the empty string when it is {@code null}. */
    private static String nullToEmpty(String value) {
        return (value != null) ? value : "";
    }

    /**
     * Resets the browser: destination, last result, referer, pending POST data
     * and all cookies are forgotten.
     */
    public void reset() {
        url = null;
        connection = null;
        content = null;
        code = -1;
        referer = null;
        // Fresh maps instead of clear(): the current ones may belong to the caller.
        postVars = new HashMap<String, String>();
        cookies.clear();
        uploads = new HashMap<String, File>();
    }

    /**
     * Get contents of last execution.
     *
     * @return the response body as text, or {@code null} if there is none
     */
    public String getContent() {
        return content;
    }

    /**
     * Gets HTTP code of last execution.
     *
     * @return the status code, or {@code -1} if nothing has been executed
     */
    public int getCode() {
        return code;
    }

    /**
     * Extracts the charset name from a {@code Content-Type} header value.
     *
     * @param contentType header value, may be {@code null}
     * @return the declared charset, or UTF-8 when none is declared
     */
    private static String charsetOf(String contentType) {
        if (contentType != null) {
            String prefix = "charset=";
            for (String parameter : contentType.split(";")) {
                String trimmed = parameter.trim();
                if (trimmed.regionMatches(true, 0, prefix, 0, prefix.length())) {
                    String name = trimmed.substring(prefix.length()).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    if (name.length() > 0) {
                        return name;
                    }
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Reads the whole stream as text and closes it.
     *
     * @param is stream to read
     * @param charsetName charset to decode with; UTF-8 is used if it is unknown
     * @return the decoded text, empty if the stream has no data
     * @throws IOException if reading the stream fails
     */
    private static String stream2string(InputStream is, String charsetName) throws IOException {
        Scanner scanner;
        try {
            scanner = new Scanner(is, charsetName);
        } catch (IllegalArgumentException unknownCharset) {
            // The server declared a charset this JVM does not know.
            scanner = new Scanner(is, DEFAULT_CHARSET);
        }
        try {
            scanner.useDelimiter("\\A");
            StringBuilder salida = new StringBuilder();
            while (scanner.hasNext()) {
                salida.append(scanner.next());
            }
            // Scanner swallows read errors; surface them instead of returning truncated text.
            IOException failure = scanner.ioException();
            if (failure != null) {
                throw failure;
            }
            return salida.toString();
        } finally {
            scanner.close();
        }
    }
}

Este código depende de la librería jsoup, que naturalmente está disponible como software libre por ahí y funciona desde Maven perfectamente.

Por supuesto estaría encantado de poder leer cualquier mejora o crítica. ¡Comenta, comenta!

Deja una respuesta

Tu dirección de correo electrónico no será publicada. Los campos obligatorios están marcados con *

Este sitio usa Akismet para reducir el spam. Aprende cómo se procesan los datos de tus comentarios.