Webcrawler java Hoverkraft

He estado trasteando con una forma de simular un navegador en Java. Hasta ahora he usado JMeter, que es tremendamente potente y configurable, y para pruebas de carga es imprescindible. No obstante, hay dos detalles que no me convencen: primero, a veces uno quiere algo programático en lugar de declarativo; y segundo, JMeter es durillo de entender y configurar. Además, no siempre es necesario tener métricas exóticas o peticiones de Ajax; a veces solo queremos acceder a algún servicio web o analizar una web para bajar ficheros o automatizar tareas.

Aunque hay un montón de soluciones disponibles, me he propuesto hacer un pequeño simulador de navegador (un webcrawler) en Java que permita implementar tareas de forma sencilla. A la criatura le he puesto de nombre Hoverkraft. Dejo por aquí el código fuente.

package net.krusher.hoverkraft;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Serializable;
import java.net.HttpCookie;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

/**
 * Hoverkraft - Das Web Boot.
 * <p>
 * A minimal programmatic browser simulator built on {@link HttpURLConnection}. Typical use:
 * point it at a URL with {@link #go(String, Method)}, optionally queue form fields
 * ({@link #setPostVars(Map)}) and file uploads ({@link #setUploads(Map)}), run the request
 * with {@link #execute()} and read the outcome through {@link #getContent()},
 * {@link #getCode()} or {@link #getXml()}.
 * <p>
 * Cookies received from any response are kept in a single jar keyed by cookie name and are
 * sent back on every later request, whatever its host or path. The URL of the last executed
 * page is sent as the {@code referer} header of the next request.
 * <p>
 * No synchronization is performed, so an instance should not be shared between threads
 * without external locking.
 *
 * @author Axelei
 */
public class Hoverkraft implements Serializable {

	/** Serialization version identifier. */
	private static final long serialVersionUID = -4846381367781986634L;
	/** Value sent in the {@code user-agent} request header. */
	public static final String USER_AGENT = " Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36";
	/** Value sent in the {@code accept} request header. */
	public static final String ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
	/** Value sent in the {@code accept-language} request header. */
	public static final String ACCEPT_LANGUAGE = "es,en-US;q=0.8,en;q=0.6";
	/** Maximum number of redirects followed by a single {@link #execute()} call. */
	public static final int MAX_TRIES = 5;
	/** Line terminator used inside multipart bodies. */
	private static final String LINE_FEED = "\r\n";
	/** Charset used for request bodies and for responses that do not declare one. */
	private static final String DEFAULT_CHARSET = "UTF-8";
	/** HTTP 307 Temporary Redirect (no constant in {@link HttpURLConnection}). */
	private static final int HTTP_TEMPORARY_REDIRECT = 307;
	/** HTTP 308 Permanent Redirect (no constant in {@link HttpURLConnection}). */
	private static final int HTTP_PERMANENT_REDIRECT = 308;

	/** HTTP methods supported by the simulator. */
	public enum Method {
		GET, POST
	}

	/** Target of the next (or last) request. */
	private URL url;
	/** Connection of the last request; a live connection cannot be serialized, hence transient. */
	private transient HttpURLConnection connection;
	/** Decoded body of the last response, or {@code null} before the first execution. */
	private String content;
	/** Status code of the last response, or -1 before the first execution. */
	private int code = -1;
	/** Method of the next (or last) request. */
	private Method method;
	/** URL of the last executed page, sent as {@code referer}; {@code null} if none yet. */
	private String referer;
	/** Form fields queued for the next POST. */
	private Map<String, String> postVars = new HashMap<String, String>();
	/**
	 * Cookie jar, keyed by cookie name.
	 * NOTE: java.net.HttpCookie does not implement Serializable, so serializing an instance
	 * whose jar is not empty fails.
	 */
	private Map<String, HttpCookie> cookies = new HashMap<String, HttpCookie>();
	/** Files queued for the next POST, keyed by form field name. */
	private Map<String, File> uploads = new HashMap<String, File>();

	/**
	 * Parses the content of the last execution with jsoup.
	 * @return the parsed document
	 */
	public Document getXml() {
		return Jsoup.parse(content);
	}

	/**
	 * Creates a browser with an empty cookie jar and no destination.
	 */
	public Hoverkraft() {
		super();
	}

	/**
	 * Set sail to a destination. Nothing is requested until {@link #execute()} is called.
	 * @param url absolute URL of the destination
	 * @param method HTTP method to use
	 * @throws MalformedURLException if {@code url} cannot be parsed
	 */
	public void go(String url, Method method) throws MalformedURLException {
		this.url = new URL(url);
		this.method = method;
	}

	/**
	 * Set sail to a destination using GET.
	 * @param url absolute URL of the destination
	 * @throws MalformedURLException if {@code url} cannot be parsed
	 */
	public void go(String url) throws MalformedURLException {
		go(url, Method.GET);
	}

	/**
	 * Disconnects the connection of the last execution, if there is one.
	 */
	public void disconnect() {
		if (connection != null) {
			connection.disconnect();
		}
	}

	/**
	 * Queues form fields for the next POST. The map is used as-is (it is not copied) until a
	 * POST consumes it; the given map itself is never modified.
	 * @param vars field name to value; {@code null} means no fields
	 */
	public void setPostVars(Map<String, String> vars) {
		this.postVars = (vars != null) ? vars : new HashMap<String, String>();
	}

	/**
	 * Queues files for the next POST, which is then sent as {@code multipart/form-data}. The
	 * map is used as-is (it is not copied) until a POST consumes it; the given map itself is
	 * never modified.
	 * @param uploads form field name to file; {@code null} means no uploads
	 */
	public void setUploads(Map<String, File> uploads) {
		this.uploads = (uploads != null) ? uploads : new HashMap<String, File>();
	}

	/**
	 * Executes the web petition.
	 * <p>
	 * Redirects (301, 302, 303, 307, 308) are followed here rather than by the connection, so
	 * that cookies set by intermediate responses end up in the jar. At most {@link #MAX_TRIES}
	 * redirects are followed; if the limit is reached, or a redirect carries no
	 * {@code Location}, that redirect response is treated as the final one. After a 301, 302
	 * or 303 the request continues as GET; 307 and 308 keep the method and resend the body.
	 * <p>
	 * Once a POST body has been sent, the queued form fields and uploads are dropped.
	 * @throws IOException if the request fails or the response body cannot be read
	 * @throws IllegalStateException if no destination has been set with {@code go}
	 */
	public void execute() throws IOException {

		if (url == null || method == null) {
			throw new IllegalStateException("go() must be called before execute()");
		}

		boolean bodySent = false;
		int redirects = 0;

		try {
			while (true) {
				connection = (HttpURLConnection) url.openConnection();
				// Automatic following would hide the Set-Cookie headers of intermediate responses
				connection.setInstanceFollowRedirects(false);
				bodySent |= setProperties(connection);

				connection.connect();

				code = connection.getResponseCode();
				storeCookies(connection.getHeaderFields());

				String location = connection.getHeaderField("Location");
				if (!isRedirect(code) || location == null || redirects >= MAX_TRIES) {
					break;
				}
				redirects++;

				// Location may be relative, so resolve it against the URL just requested
				URL target = new URL(url, location);
				boolean keepMethod = code == HTTP_TEMPORARY_REDIRECT || code == HTTP_PERMANENT_REDIRECT;
				this.url = target;
				if (!keepMethod) {
					this.method = Method.GET;
				}
			}
		} finally {
			if (bodySent) {
				// Replace instead of clear(): the maps may belong to the caller
				postVars = new HashMap<String, String>();
				uploads = new HashMap<String, File>();
			}
		}

		InputStream is = connection.getInputStream();
		content = stream2string(is, charsetOf(connection.getContentType()));

		referer = url.toString();
	}

	/**
	 * Tells whether a status code is one of the redirects this class follows.
	 */
	private static boolean isRedirect(int status) {
		return status == HttpURLConnection.HTTP_MOVED_PERM
			|| status == HttpURLConnection.HTTP_MOVED_TEMP
			|| status == HttpURLConnection.HTTP_SEE_OTHER
			|| status == HTTP_TEMPORARY_REDIRECT
			|| status == HTTP_PERMANENT_REDIRECT;
	}

	/**
	 * Adds every cookie found in the Set-Cookie headers of a response to the jar, replacing
	 * cookies of the same name.
	 */
	private void storeCookies(Map<String, List<String>> headers) {
		for (Entry<String, List<String>> header : headers.entrySet()) {
			String headerName = header.getKey();
			// The key can be null (status line) and its case is whatever the server sent
			if (headerName == null || !headerName.equalsIgnoreCase("Set-Cookie")) {
				continue;
			}
			for (String headerValue : header.getValue()) {
				List<HttpCookie> parsedCookies;
				try {
					parsedCookies = HttpCookie.parse(headerValue);
				} catch (IllegalArgumentException ignored) {
					// A malformed cookie should not abort the whole request
					continue;
				}
				for (HttpCookie parsedCookie : parsedCookies) {
					cookies.put(parsedCookie.getName(), parsedCookie);
				}
			}
		}
	}

	/**
	 * Applies headers, cookies and, for POST, the request body to a freshly opened connection.
	 * Writing the body opens the connection.
	 * @return {@code true} if a request body was written
	 */
	private boolean setProperties(HttpURLConnection connection) throws IOException {

		// Headers
		connection.setRequestProperty("user-agent", USER_AGENT);
		connection.setRequestProperty("accept", ACCEPT);
		connection.setRequestProperty("accept-language", ACCEPT_LANGUAGE);
		if (referer != null) {
			connection.setRequestProperty("referer", referer);
		}
		connection.setRequestMethod(method.name());

		// Cookies: one header carrying every pair, since setRequestProperty overwrites
		if (!cookies.isEmpty()) {
			StringBuilder cookieHeader = new StringBuilder();
			for (HttpCookie cookie : cookies.values()) {
				if (cookieHeader.length() > 0) {
					cookieHeader.append("; ");
				}
				cookieHeader.append(cookie.getName()).append('=').append(cookie.getValue());
			}
			connection.setRequestProperty("Cookie", cookieHeader.toString());
		}

		connection.setDoOutput(false);

		// Post variables and uploads
		if (method != Method.POST) {
			return false;
		}
		if (!uploads.isEmpty()) {
			writeMultipartBody(connection);
			return true;
		}
		if (!postVars.isEmpty()) {
			writeUrlEncodedBody(connection);
			return true;
		}
		return false;
	}

	/**
	 * Sends the queued form fields as an application/x-www-form-urlencoded body.
	 */
	private void writeUrlEncodedBody(HttpURLConnection connection) throws IOException {

		connection.setDoOutput(true);
		StringBuilder urlParameters = new StringBuilder();

		for (Entry<String, String> field : postVars.entrySet()) {
			if (urlParameters.length() > 0) {
				urlParameters.append('&');
			}
			String value = (field.getValue() != null) ? field.getValue() : "";
			urlParameters.append(URLEncoder.encode(field.getKey(), DEFAULT_CHARSET));
			urlParameters.append('=');
			urlParameters.append(URLEncoder.encode(value, DEFAULT_CHARSET));
		}

		// Encode once so the declared length matches the bytes actually sent
		byte[] body = urlParameters.toString().getBytes(DEFAULT_CHARSET);

		connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
		connection.setRequestProperty("charset", "utf-8");
		connection.setRequestProperty("Content-Length", Integer.toString(body.length));

		OutputStream out = connection.getOutputStream();
		try {
			out.write(body);
			out.flush();
		} finally {
			out.close();
		}
	}

	/**
	 * Sends the queued form fields and files as a multipart/form-data body.
	 */
	private void writeMultipartBody(HttpURLConnection connection) throws IOException {

		String boundary = "===" + System.currentTimeMillis() + "===";

		connection.setUseCaches(false);
		connection.setDoOutput(true);
		connection.setDoInput(true);

		connection.setRequestProperty("Content-Type", "multipart/form-data; boundary=" + boundary);
		connection.setRequestProperty("charset", "UTF-8");

		OutputStream outputStream = connection.getOutputStream();
		PrintWriter writer = new PrintWriter(new OutputStreamWriter(outputStream, DEFAULT_CHARSET), true);

		try {
			for (Entry<String, String> field : postVars.entrySet()) {
				String value = (field.getValue() != null) ? field.getValue() : "";
				writer.append("--" + boundary).append(LINE_FEED);
				writer.append("Content-Disposition: form-data; name=\"" + field.getKey() + "\"").append(LINE_FEED);
				writer.append("Content-Type: text/plain; charset=UTF-8").append(LINE_FEED);
				writer.append(LINE_FEED);
				writer.append(value).append(LINE_FEED);
				writer.flush();
			}

			for (Entry<String, File> upload : uploads.entrySet()) {
				String fileName = upload.getValue().getName();
				String contentType = URLConnection.guessContentTypeFromName(fileName);
				if (contentType == null) {
					// Unknown extension: declare generic binary data instead of the text "null"
					contentType = "application/octet-stream";
				}
				writer.append("--" + boundary).append(LINE_FEED);
				writer.append("Content-Disposition: form-data; name=\"" + upload.getKey() + "\"; filename=\"" + fileName + "\"").append(LINE_FEED);
				writer.append("Content-Type: " + contentType).append(LINE_FEED);
				writer.append("Content-Transfer-Encoding: binary").append(LINE_FEED);
				writer.append(LINE_FEED);
				// The part headers must reach the stream before the raw bytes written below
				writer.flush();

				FileInputStream inputStream = new FileInputStream(upload.getValue());
				try {
					byte[] buffer = new byte[4096];
					int bytesRead;
					while ((bytesRead = inputStream.read(buffer)) != -1) {
						outputStream.write(buffer, 0, bytesRead);
					}
					outputStream.flush();
				} finally {
					inputStream.close();
				}

				// This line break belongs to the next boundary, not to the file content
				writer.append(LINE_FEED);
				writer.flush();
			}

			// No additional line break here: it would become part of the last uploaded file
			writer.append("--" + boundary + "--").append(LINE_FEED);

			// PrintWriter swallows I/O errors, so ask for them explicitly
			if (writer.checkError()) {
				throw new IOException("Failed to write multipart request body");
			}
		} finally {
			writer.close();
		}
	}

	/**
	 * Resets the browser: forgets destination, last response, referer, queued form data and
	 * all cookies.
	 */
	public void reset() {
		url = null;
		connection = null;
		content = null;
		code = -1;
		method = null;
		referer = null;
		// Fresh maps instead of clear(): the current ones may belong to the caller
		postVars = new HashMap<String, String>();
		uploads = new HashMap<String, File>();
		cookies.clear();
	}

	/**
	 * Get contents of last execution
	 * @return the decoded response body, or {@code null} if nothing has been executed
	 */
	public String getContent() {
		return content;
	}

	/**
	 * Gets HTTP code of last execution
	 * @return the status code, or -1 if nothing has been executed
	 */
	public int getCode() {
		return code;
	}

	/**
	 * Extracts the charset parameter from a Content-Type header value.
	 * @return the declared charset name, or the default charset if none is declared
	 */
	private static String charsetOf(String contentType) {
		if (contentType != null) {
			String prefix = "charset=";
			for (String parameter : contentType.split(";")) {
				String trimmed = parameter.trim();
				if (trimmed.regionMatches(true, 0, prefix, 0, prefix.length())) {
					String name = trimmed.substring(prefix.length()).trim();
					if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
						name = name.substring(1, name.length() - 1);
					}
					if (name.length() > 0) {
						return name;
					}
				}
			}
		}
		return DEFAULT_CHARSET;
	}

	/**
	 * Reads a whole stream into a string and closes it.
	 * @param is stream to drain
	 * @param charsetName charset used to decode; the default charset is used if it is unknown
	 * @throws IOException if reading the stream fails
	 */
	private static String stream2string(InputStream is, String charsetName) throws IOException {
		Scanner scanner;
		try {
			scanner = new Scanner(is, charsetName);
		} catch (IllegalArgumentException ignored) {
			// The server announced a charset this JVM does not know
			scanner = new Scanner(is, DEFAULT_CHARSET);
		}
		try {
			// "\\A" only matches at the start of input, so one token spans the whole stream
			scanner.useDelimiter("\\A");
			StringBuilder text = new StringBuilder();
			while (scanner.hasNext()) {
				text.append(scanner.next());
			}
			// Scanner hides read errors; surface them instead of returning a truncated body
			IOException failure = scanner.ioException();
			if (failure != null) {
				throw failure;
			}
			return text.toString();
		} finally {
			scanner.close();
		}
	}
}

Este código depende de la librería jsoup, que naturalmente está disponible como software libre por ahí y funciona desde Maven perfectamente.

Por supuesto estaría encantado de poder leer cualquier mejora o crítica. ¡Comenta, comenta!

Deja una respuesta

Tu dirección de correo electrónico no será publicada. Los campos obligatorios están marcados con *

Este sitio usa Akismet para reducir el spam. Aprende cómo se procesan los datos de tus comentarios.