Added the GathererCrawler project (used for generating mtg-cards-data.txt)

This commit is contained in:
North 2011-07-23 19:18:37 +03:00
parent d72a619530
commit 929d982e38
8 changed files with 750 additions and 0 deletions

View file

@ -0,0 +1,25 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the GathererCrawler tool, packaged as a plain jar. -->
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates. -->
<groupId>ro.trp</groupId>
<artifactId>GathererCrawler</artifactId>
<version>1.0</version>
<packaging>jar</packaging>
<name>GathererCrawler</name>
<url>http://maven.apache.org</url>
<properties>
<!-- Source files are read as UTF-8 by the build plugins. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<!-- jsoup is the only dependency declared here; it is the HTML fetching/parsing library. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.5.2</version>
</dependency>
</dependencies>
</project>

View file

@ -0,0 +1,197 @@
package north.gatherercrawler;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * A single card record keyed by its multiverse id.
 *
 * <p>A card can be serialized to one line of text with {@link #toString()} and
 * read back with {@link #Card(String)}. The line holds twelve fields separated
 * by {@code |}, in this order: multiverse id, name, mana cost, converted mana
 * cost, types, card text, flavor text, power/toughness, expansion, rarity,
 * card number, artist. The mana cost and card text fields are themselves lists
 * whose entries are separated by {@code $}. Absent values are written as empty
 * fields and read back as {@code null}.
 *
 * <p>Field values are not escaped, so a value containing {@code |} (or a list
 * entry containing {@code $}) does not survive a round trip.
 *
 * <p>Cards are ordered by multiverse id.
 *
 * @author North
 */
public class Card implements Comparable<Card> {

    /** Number of pipe-separated fields a serialized card line must contain. */
    private static final int FIELD_COUNT = 12;

    private Integer multiverseId;
    private String name;
    private List<String> manaCost;
    private Integer convertedManaCost;
    private String types;
    private List<String> cardText;
    private String flavorText;
    private String powerToughness;
    private String expansion;
    private String rarity;
    private String cardNumber;
    private String artist;

    /**
     * Creates an otherwise empty card with the given id.
     *
     * <p>The mana cost and card text lists start out empty (not {@code null})
     * so that {@link #toString()} works on a card that has not been filled in.
     *
     * @param multiverseId the multiverse id of the card
     */
    public Card(Integer multiverseId) {
        this.multiverseId = multiverseId;
        this.manaCost = new ArrayList<String>();
        this.cardText = new ArrayList<String>();
    }

    /**
     * Parses a card from a line produced by {@link #toString()}.
     *
     * <p>An empty mana cost or card text field yields a list holding a single
     * empty string, because that is what {@code "".split(...)} returns.
     *
     * @param card the serialized card line
     * @throws IllegalArgumentException if the line has fewer than twelve fields
     * @throws NumberFormatException if the id or converted mana cost field is
     *         non-empty and not an integer
     */
    public Card(String card) {
        // The limit keeps trailing empty fields, which split() would otherwise drop.
        String[] split = card.split("\\|", 13);
        if (split.length < FIELD_COUNT) {
            throw new IllegalArgumentException("Expected at least " + FIELD_COUNT
                    + " '|'-separated fields but found " + split.length + ": " + card);
        }
        if (split[0].length() > 0) {
            multiverseId = Integer.parseInt(split[0]);
        }
        name = emptyToNull(split[1]);
        manaCost = new ArrayList<String>(Arrays.asList(split[2].split("\\$")));
        if (split[3].length() > 0) {
            convertedManaCost = Integer.parseInt(split[3]);
        }
        types = emptyToNull(split[4]);
        cardText = new ArrayList<String>(Arrays.asList(split[5].split("\\$")));
        flavorText = emptyToNull(split[6]);
        powerToughness = emptyToNull(split[7]);
        expansion = emptyToNull(split[8]);
        rarity = emptyToNull(split[9]);
        cardNumber = emptyToNull(split[10]);
        artist = emptyToNull(split[11]);
    }

    /** Returns {@code null} for an empty field and the field itself otherwise. */
    private static String emptyToNull(String field) {
        return field.length() > 0 ? field : null;
    }

    /** Returns the text written for a value: empty for {@code null}. */
    private static String text(Object value) {
        return value != null ? value.toString() : "";
    }

    /** Appends the entries separated by {@code $}; a {@code null} list appends nothing. */
    private static void appendJoined(StringBuilder sb, List<String> entries) {
        if (entries == null) {
            return;
        }
        for (int i = 0; i < entries.size(); i++) {
            if (i > 0) {
                sb.append("$");
            }
            sb.append(entries.get(i));
        }
    }

    public String getArtist() {
        return artist;
    }

    public void setArtist(String artist) {
        this.artist = artist;
    }

    public String getCardNumber() {
        return cardNumber;
    }

    public void setCardNumber(String cardNumber) {
        this.cardNumber = cardNumber;
    }

    public List<String> getCardText() {
        return cardText;
    }

    public void setCardText(List<String> cardText) {
        this.cardText = cardText;
    }

    public Integer getConvertedManaCost() {
        return convertedManaCost;
    }

    public void setConvertedManaCost(Integer convertedManaCost) {
        this.convertedManaCost = convertedManaCost;
    }

    public String getExpansion() {
        return expansion;
    }

    public void setExpansion(String expansion) {
        this.expansion = expansion;
    }

    public String getFlavorText() {
        return flavorText;
    }

    public void setFlavorText(String flavorText) {
        this.flavorText = flavorText;
    }

    public List<String> getManaCost() {
        return manaCost;
    }

    public void setManaCost(List<String> manaCost) {
        this.manaCost = manaCost;
    }

    public Integer getMultiverseId() {
        return multiverseId;
    }

    public void setMultiverseId(Integer multiverseId) {
        this.multiverseId = multiverseId;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getPowerToughness() {
        return powerToughness;
    }

    public void setPowerToughness(String powerToughness) {
        this.powerToughness = powerToughness;
    }

    public String getRarity() {
        return rarity;
    }

    public void setRarity(String rarity) {
        this.rarity = rarity;
    }

    public String getTypes() {
        return types;
    }

    public void setTypes(String types) {
        this.types = types;
    }

    /**
     * Serializes the card to one pipe-separated line.
     *
     * <p>Every absent value is written as an empty field, never as the text
     * {@code "null"}, so the line can be parsed again by {@link #Card(String)}.
     *
     * @return the serialized line, without a line terminator
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append(text(multiverseId)).append("|");
        sb.append(text(name)).append("|");
        appendJoined(sb, manaCost);
        sb.append("|");
        sb.append(text(convertedManaCost)).append("|");
        sb.append(text(types)).append("|");
        appendJoined(sb, cardText);
        sb.append("|");
        sb.append(text(flavorText)).append("|");
        sb.append(text(powerToughness)).append("|");
        sb.append(text(expansion)).append("|");
        sb.append(text(rarity)).append("|");
        sb.append(text(cardNumber)).append("|");
        sb.append(text(artist));
        return sb.toString();
    }

    /**
     * Orders cards by multiverse id.
     *
     * @throws NullPointerException if either card has no multiverse id
     */
    public int compareTo(Card o) {
        return this.multiverseId.compareTo(o.getMultiverseId());
    }
}

View file

@ -0,0 +1,135 @@
package north.gatherercrawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Worker thread that takes multiverse ids from {@link ParseQueue}, downloads
 * the matching Gatherer card details page and stores the parsed {@link Card}
 * in {@link CardsList}.
 *
 * <p>Ids of other printings linked from a card page are added to the queue so
 * they get parsed as well. The worker stops as soon as it finds the queue
 * empty and then reports to {@link ThreadStarter#threadDone()}.
 *
 * @author robert.biter
 */
public class CardParser extends Thread {

    /** Common prefix of the element ids on the card details page. */
    private static final String ID_PREFIX = "#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_";
    /** Maximum number of download attempts per card. */
    private static final int MAX_ATTEMPTS = 30;
    /** Connection timeout per attempt, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 20000;

    /**
     * Downloads the page, retrying on I/O failure.
     *
     * @return the parsed document, or {@code null} if every attempt failed
     */
    private Document fetch(String url) {
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            try {
                Connection connection = Jsoup.connect(url);
                connection.timeout(TIMEOUT_MILLIS);
                return connection.get();
            } catch (IOException ex) {
                // Transient network failure: fall through and try again.
            }
        }
        return null;
    }

    /**
     * Returns the trimmed text of the first element matching the selector, or
     * {@code null} when nothing matches.
     */
    private static String firstText(Document doc, String selector) {
        Elements select = doc.select(selector);
        return select.isEmpty() ? null : select.get(0).text().trim();
    }

    /**
     * Downloads and parses one card and adds it to {@link CardsList}.
     *
     * @param multiverseId id of the card to fetch
     * @return {@code true} if the card was parsed and stored, {@code false} if
     *         the download or the parsing failed
     */
    private boolean parseCard(Integer multiverseId) {
        String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
        Card card = new Card(multiverseId);

        Document doc = fetch(url);
        if (doc == null) {
            System.out.println("Card get exception: " + multiverseId);
            return false;
        }

        try {
            String value = firstText(doc, ID_PREFIX + "nameRow .value");
            if (value != null) {
                card.setName(value);
            }

            // Each mana symbol is an image; the symbol name is taken from its URL.
            List<String> manaCost = new ArrayList<String>();
            for (Element element : doc.select(ID_PREFIX + "manaRow .value img")) {
                manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
            }
            card.setManaCost(manaCost);

            value = firstText(doc, ID_PREFIX + "cmcRow .value");
            if (value != null) {
                card.setConvertedManaCost(Integer.parseInt(value));
            }

            value = firstText(doc, ID_PREFIX + "typeRow .value");
            if (value != null) {
                card.setTypes(value);
            }

            // One entry per text box; inline symbol images are rewritten as {name}.
            List<String> cardText = new ArrayList<String>();
            for (Element element : doc.select(ID_PREFIX + "textRow .value .cardtextbox")) {
                cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&amp;name=", "{").replace("&amp;type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", ""));
            }
            card.setCardText(cardText);

            value = firstText(doc, ID_PREFIX + "FlavorText .cardtextbox i");
            if (value != null) {
                card.setFlavorText(value);
            }

            value = firstText(doc, ID_PREFIX + "ptRow .value");
            if (value != null) {
                card.setPowerToughness(value);
            }

            // The expansion name is read from the second link of the set symbol
            // block; a page with only one link there fails the parse (reported below).
            Elements select = doc.select(ID_PREFIX + "currentSetSymbol a");
            if (!select.isEmpty()) {
                card.setExpansion(select.get(1).text().trim());
            }

            value = firstText(doc, ID_PREFIX + "rarityRow .value span");
            if (value != null) {
                card.setRarity(value);
            }

            // Queue the other printings of this card that have not been parsed yet.
            for (Element element : doc.select(ID_PREFIX + "otherSetsValue a")) {
                Integer otherSet = Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", ""));
                if (!ParsedList.contains(otherSet)) {
                    ParseQueue.add(otherSet);
                }
            }

            value = firstText(doc, ID_PREFIX + "numberRow .value");
            if (value != null) {
                card.setCardNumber(value);
            }

            value = firstText(doc, ID_PREFIX + "ArtistCredit a");
            if (value != null) {
                card.setArtist(value);
            }
        } catch (Exception e) {
            // Report the card instead of dropping it silently.
            System.out.println("Card parse exception: " + multiverseId + " (" + e + ")");
            return false;
        }

        CardsList.add(card);
        return true;
    }

    /**
     * Processes queued ids until the queue is found empty, then signals
     * completion. The completion signal is sent even if the worker dies with
     * an unexpected exception, so the coordinating thread cannot wait forever.
     */
    @Override
    public void run() {
        try {
            while (!ParseQueue.isEmpty()) {
                Integer multiverseId;
                try {
                    multiverseId = ParseQueue.remove();
                } catch (java.util.NoSuchElementException drained) {
                    // Another worker emptied the queue between isEmpty() and remove().
                    break;
                }
                if (!ParsedList.contains(multiverseId)) {
                    ParsedList.add(multiverseId);
                    parseCard(multiverseId);
                }
            }
        } finally {
            ThreadStarter.threadDone();
        }
    }
}

View file

@ -0,0 +1,26 @@
package north.gatherercrawler;
import java.util.Iterator;
import java.util.concurrent.ConcurrentSkipListSet;
/**
 * Process-wide collection of the cards gathered so far.
 *
 * <p>All access goes through the static methods, which delegate to one shared
 * instance backed by a {@link ConcurrentSkipListSet}; the set keeps the cards
 * in their natural order.
 *
 * @author North
 */
public class CardsList {

    private static final CardsList instance = new CardsList();

    private final ConcurrentSkipListSet<Card> list = new ConcurrentSkipListSet<Card>();

    public CardsList() {
    }

    /** Gives the static entry points access to the shared backing set. */
    private static ConcurrentSkipListSet<Card> shared() {
        return instance.list;
    }

    /**
     * Adds a card to the shared set.
     *
     * @param element the card to store
     */
    public static void add(Card element) {
        shared().add(element);
    }

    /**
     * @return an iterator over the stored cards, in the set's order
     */
    public static Iterator<Card> iterator() {
        return shared().iterator();
    }
}

View file

@ -0,0 +1,193 @@
package north.gatherercrawler;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Entry point of the crawler.
 *
 * <p>It loads the cards saved by a previous run from {@code cards-data.txt},
 * queues the multiverse ids of every card in the configured sets that is not
 * known yet, and starts {@link ThreadStarter} to crawl the queue.
 *
 * @author robert.biter
 */
public class Main {

    /** Maximum number of download attempts per checklist request. */
    private static final int MAX_ATTEMPTS = 30;
    /** Connection timeout per checklist request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 300000;
    /** Number of sets requested together in one checklist search. */
    private static final int SETS_PER_REQUEST = 20;

    /**
     * Loads previously saved cards into {@link CardsList} and marks their ids
     * as parsed in {@link ParsedList}.
     *
     * <p>Problems are reported on standard error and never thrown: a missing
     * or unreadable file ends the loading, a malformed line is skipped.
     */
    private static void readCardsFromFile() {
        BufferedReader br = null;
        try {
            // NOTE(review): the file is read with the platform default charset.
            br = new BufferedReader(new InputStreamReader(new FileInputStream("cards-data.txt")));
            String strLine;
            while ((strLine = br.readLine()) != null) {
                if (strLine.length() == 0) {
                    continue;
                }
                try {
                    Card card = new Card(strLine);
                    if (card.getMultiverseId() == null) {
                        System.err.println("Error: card line without multiverse id skipped: " + strLine);
                        continue;
                    }
                    CardsList.add(card);
                    ParsedList.add(card.getMultiverseId());
                } catch (RuntimeException e) {
                    // One bad line must not discard the rest of the file.
                    System.err.println("Error: " + e.getMessage());
                }
            }
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
        } finally {
            if (br != null) {
                try {
                    br.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
    }

    /**
     * Downloads one checklist page and queues every listed card id that has
     * not been parsed yet, retrying the download on I/O failure.
     *
     * @param url the checklist search URL
     * @return {@code true} if the page was processed, {@code false} if every
     *         download attempt failed
     */
    private static boolean queueCardsFromChecklist(String url) {
        for (int attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
            Document doc;
            try {
                Connection connection = Jsoup.connect(url);
                connection.timeout(TIMEOUT_MILLIS);
                doc = connection.get();
            } catch (IOException ex) {
                // Transient network failure: try again.
                continue;
            }
            System.out.println(url);
            Elements select = doc.select(".checklist .name a");
            for (Element element : select) {
                Integer multiverseId = Integer.parseInt(element.attr("href").replace("../Card/Details.aspx?multiverseid=", ""));
                if (!ParsedList.contains(multiverseId)) {
                    ParseQueue.add(multiverseId);
                }
            }
            return true;
        }
        return false;
    }

    /**
     * Runs the crawler. Uncomment the sets to crawl in the list below.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        List<String> sets = new ArrayList<String>();
//        sets.add("Alara Reborn");
//        sets.add("Alliances");
//        sets.add("Antiquities");
//        sets.add("Apocalypse");
//        sets.add("Arabian Nights");
//        sets.add("Archenemy");
//        sets.add("Battle Royale Box Set");
//        sets.add("Beatdown Box Set");
//        sets.add("Betrayers of Kamigawa");
//        sets.add("Champions of Kamigawa");
//        sets.add("Chronicles");
//        sets.add("Classic Sixth Edition");
//        sets.add("Coldsnap");
//        sets.add("Conflux");
//        sets.add("Darksteel");
//        sets.add("Dissension");
//        sets.add("Duel Decks: Divine vs. Demonic");
//        sets.add("Duel Decks: Elspeth vs. Tezzeret");
//        sets.add("Duel Decks: Elves vs. Goblins");
//        sets.add("Duel Decks: Garruk vs. Liliana");
//        sets.add("Duel Decks: Jace vs. Chandra");
//        sets.add("Duel Decks: Knights vs. Dragons");
//        sets.add("Duel Decks: Phyrexia vs. the Coalition");
//        sets.add("Eighth Edition");
//        sets.add("Eventide");
//        sets.add("Exodus");
//        sets.add("Fallen Empires");
//        sets.add("Fifth Dawn");
//        sets.add("Fifth Edition");
//        sets.add("Fourth Edition");
//        sets.add("From the Vault: Dragons");
//        sets.add("From the Vault: Exiled");
//        sets.add("From the Vault: Relics");
//        sets.add("Future Sight");
//        sets.add("Guildpact");
//        sets.add("Homelands");
//        sets.add("Ice Age");
//        sets.add("Invasion");
//        sets.add("Judgment");
//        sets.add("Legends");
//        sets.add("Legions");
//        sets.add("Limited Edition Alpha");
//        sets.add("Limited Edition Beta");
//        sets.add("Lorwyn");
//        sets.add("Magic 2010");
//        sets.add("Magic 2011");
//        sets.add("Magic 2012");
//        sets.add("Masters Edition");
//        sets.add("Masters Edition II");
//        sets.add("Masters Edition III");
//        sets.add("Masters Edition IV");
//        sets.add("Mercadian Masques");
//        sets.add("Mirage");
//        sets.add("Mirrodin");
//        sets.add("Mirrodin Besieged");
//        sets.add("Morningtide");
//        sets.add("Nemesis");
//        sets.add("New Phyrexia");
//        sets.add("Ninth Edition");
//        sets.add("Odyssey");
//        sets.add("Onslaught");
//        sets.add("Planar Chaos");
//        sets.add("Planechase");
//        sets.add("Planeshift");
//        sets.add("Portal");
//        sets.add("Portal Second Age");
//        sets.add("Portal Three Kingdoms");
//        sets.add("Premium Deck Series: Fire and Lightning");
//        sets.add("Premium Deck Series: Slivers");
//        sets.add("Promo set for Gatherer");
//        sets.add("Prophecy");
//        sets.add("Ravnica: City of Guilds");
//        sets.add("Revised Edition");
//        sets.add("Rise of the Eldrazi");
//        sets.add("Saviors of Kamigawa");
//        sets.add("Scars of Mirrodin");
//        sets.add("Scourge");
//        sets.add("Seventh Edition");
//        sets.add("Shadowmoor");
//        sets.add("Shards of Alara");
//        sets.add("Starter 1999");
//        sets.add("Starter 2000");
//        sets.add("Stronghold");
//        sets.add("Tempest");
//        sets.add("Tenth Edition");
//        sets.add("The Dark");
//        sets.add("Time Spiral");
//        sets.add("Time Spiral \"Timeshifted\"");
//        sets.add("Torment");
//        sets.add("Unlimited Edition");
//        sets.add("Urza's Destiny");
//        sets.add("Urza's Legacy");
//        sets.add("Urza's Saga");
//        sets.add("Vanguard");
//        sets.add("Visions");
//        sets.add("Weatherlight");
//        sets.add("Worldwake");
//        sets.add("Zendikar");
//        sets.add("Magic: The Gathering-Commander");
//        sets.add("Unglued");
//        sets.add("Unhinged");

        readCardsFromFile();

        // Sets are combined into one search filter per batch; each set is
        // written as |["Set+Name"].
        StringBuilder sb = new StringBuilder();
        int added = 0;
        for (String set : sets) {
            sb.append("|[\"").append(set.replace(" ", "+")).append("\"]");
            added++;
            if (added % SETS_PER_REQUEST == 0 || added == sets.size()) {
                String url = "http://gatherer.wizards.com/Pages/Search/Default.aspx?action=advanced&output=checklist&set=" + sb.toString();
                if (!queueCardsFromChecklist(url)) {
                    System.out.println("Error accured");
                }
                sb = new StringBuilder();
                // Pause between search requests.
                Thread.sleep(1000);
            }
        }

        Thread t = new ThreadStarter();
        t.start();
    }
}

View file

@ -0,0 +1,29 @@
package north.gatherercrawler;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
 * Process-wide FIFO queue of multiverse ids that still have to be parsed.
 *
 * <p>All access goes through the static methods, which delegate to one shared
 * instance backed by a {@link ConcurrentLinkedQueue}.
 *
 * @author North
 */
public class ParseQueue {

    private static final ParseQueue instance = new ParseQueue();

    private final ConcurrentLinkedQueue<Integer> queue = new ConcurrentLinkedQueue<Integer>();

    public ParseQueue() {
    }

    /** Gives the static entry points access to the shared backing queue. */
    private static ConcurrentLinkedQueue<Integer> shared() {
        return instance.queue;
    }

    /**
     * Appends an id to the end of the queue.
     *
     * @param element the id to enqueue
     */
    public static void add(Integer element) {
        shared().add(element);
    }

    /**
     * Removes and returns the id at the head of the queue.
     *
     * @return the removed id
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public static Integer remove() {
        return shared().remove();
    }

    /**
     * @return {@code true} if no ids are queued
     */
    public static boolean isEmpty() {
        return shared().isEmpty();
    }
}

View file

@ -0,0 +1,25 @@
package north.gatherercrawler;
import java.util.concurrent.ConcurrentSkipListSet;
/**
 * Process-wide set of multiverse ids that have already been handled.
 *
 * <p>All access goes through the static methods, which delegate to one shared
 * instance backed by a {@link ConcurrentSkipListSet}.
 *
 * @author North
 */
public class ParsedList {

    private static final ParsedList instance = new ParsedList();

    private final ConcurrentSkipListSet<Integer> list = new ConcurrentSkipListSet<Integer>();

    public ParsedList() {
    }

    /** Gives the static entry points access to the shared backing set. */
    private static ConcurrentSkipListSet<Integer> shared() {
        return instance.list;
    }

    /**
     * Records an id as handled; adding an id twice has no further effect.
     *
     * @param element the id to record
     */
    public static void add(Integer element) {
        shared().add(element);
    }

    /**
     * @param value the id to look up
     * @return {@code true} if the id has been recorded
     */
    public static boolean contains(Integer value) {
        return shared().contains(value);
    }
}

View file

@ -0,0 +1,120 @@
package north.gatherercrawler;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.Iterator;
import java.util.List;
/**
 * Coordinator thread: starts the {@link CardParser} workers, waits until all
 * of them have reported completion through {@link #threadDone()}, then writes
 * the collected cards to {@code cards-data.txt} and {@code mtg-cards-data.txt}.
 *
 * @author North
 */
public class ThreadStarter extends Thread {

    /** Number of workers that have finished; guarded by the class lock. */
    private static int threadsDone = 0;
    /** Number of workers to start. */
    private final int threads = 10;

    /** Called by a worker when it has finished. */
    public static synchronized void threadDone() {
        threadsDone++;
    }

    /** Reads the finished-worker count under the same lock that guards its updates. */
    private static synchronized int finishedCount() {
        return threadsDone;
    }

    /** Closes the writer, reporting a failure (for example a failed final flush). */
    private static void close(BufferedWriter out) {
        if (out == null) {
            return;
        }
        try {
            out.close();
        } catch (java.io.IOException e) {
            System.err.println("Error: " + e.getMessage());
        }
    }

    /**
     * Writes every collected card to {@code cards-data.txt}, one
     * {@link Card#toString()} line per card. Errors are reported on standard
     * error.
     */
    private void writeCardsToFile() {
        BufferedWriter out = null;
        try {
            out = new BufferedWriter(new FileWriter("cards-data.txt"));
            Iterator<Card> iterator = CardsList.iterator();
            while (iterator.hasNext()) {
                out.write(iterator.next().toString());
                out.newLine();
            }
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
        } finally {
            close(out);
        }
    }

    /** Maps a rarity name to its one-letter code; unknown names are returned unchanged. */
    private static String rarityCode(String rarity) {
        if (rarity.equalsIgnoreCase("Mythic Rare")) {
            return "M";
        }
        if (rarity.equalsIgnoreCase("Rare")) {
            return "R";
        }
        if (rarity.equalsIgnoreCase("Uncommon")) {
            return "U";
        }
        if (rarity.equalsIgnoreCase("Common")) {
            return "C";
        }
        if (rarity.equalsIgnoreCase("Basic Land")) {
            return "L";
        }
        return rarity;
    }

    /**
     * Builds the {@code mtg-cards-data.txt} line for one card: name, expansion,
     * card number, rarity code, mana cost as {@code {X}} symbols, types, power,
     * toughness and card text, each followed by {@code |}.
     */
    private static String utilLine(Card card) {
        StringBuilder sb = new StringBuilder();
        sb.append(card.getName()).append("|");
        sb.append(card.getExpansion()).append("|");
        sb.append(card.getCardNumber() != null ? card.getCardNumber() : "").append("|");
        sb.append(rarityCode(card.getRarity() != null ? card.getRarity() : "")).append("|");

        for (String cost : card.getManaCost()) {
            if (!cost.isEmpty()) {
                sb.append("{").append(cost).append("}");
            }
        }
        sb.append("|");

        sb.append(card.getTypes()).append("|");

        String pts = card.getPowerToughness();
        if (pts != null && pts.length() > 1) {
            // A value without "/" must not abort the whole export: a missing
            // half is written as an empty field.
            String[] pt = pts.split("/");
            sb.append(pt.length > 0 ? pt[0].trim() : "").append("|");
            sb.append(pt.length > 1 ? pt[1].trim() : "").append("|");
        } else {
            sb.append("||");
        }

        List<String> cardText = card.getCardText();
        for (int i = 0; i < cardText.size(); i++) {
            if (i > 0) {
                sb.append("$");
            }
            sb.append(cardText.get(i));
        }
        sb.append("|");
        return sb.toString();
    }

    /**
     * Writes every collected card to {@code mtg-cards-data.txt} in the format
     * built by {@link #utilLine(Card)}. Errors are reported on standard error.
     */
    private void writeCardsToUtilFile() {
        BufferedWriter out = null;
        try {
            out = new BufferedWriter(new FileWriter("mtg-cards-data.txt"));
            Iterator<Card> iterator = CardsList.iterator();
            while (iterator.hasNext()) {
                out.write(utilLine(iterator.next()));
                out.newLine();
            }
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
        } finally {
            close(out);
        }
    }

    /**
     * Starts the workers, polls every five seconds until all of them have
     * finished, then writes both output files.
     */
    @Override
    public void run() {
        for (int i = 0; i < threads; i++) {
            Thread t = new CardParser();
            t.start();
        }

        // Compare primitive values: comparing boxed Integers with != compares
        // references and only works for small cached values.
        while (finishedCount() < threads) {
            try {
                synchronized (this) {
                    this.wait(5000);
                }
            } catch (InterruptedException ignored) {
                // Best effort: keep waiting so the collected cards still get written.
            }
        }

        writeCardsToFile();
        writeCardsToUtilFile();
    }
}