[GathererCrawler] updated to support flip cards, split cards and double-sided cards.

This commit is contained in:
North 2012-01-22 13:48:34 +02:00
parent 5be4f1c291
commit d0a934f2dc
7 changed files with 190 additions and 112 deletions

View file

@ -22,13 +22,14 @@ public class Card implements Comparable<Card> {
private String rarity;
private String cardNumber;
private String artist;
private Card otherSide;
/**
 * Creates a card identified by the given multiverse id.
 * Only the multiverse id is assigned here; no other field is touched by this constructor.
 *
 * @param multiverseId value stored in this card's multiverseId field
 */
public Card(Integer multiverseId) {
this.multiverseId = multiverseId;
}
public Card(String card) {
String[] split = card.split("\\|",13);
String[] split = card.split("\\|", 13);
if (split[0].length() > 0) {
multiverseId = Integer.parseInt(split[0]);
}
@ -160,6 +161,14 @@ public class Card implements Comparable<Card> {
this.types = types;
}
/**
 * Returns the card linked to this one through the otherSide field.
 *
 * @return the linked card, or null when no other side has been set
 */
public Card getOtherSide() {
return otherSide;
}
/**
 * Links another card to this one as its other side.
 * toString() appends the linked card on its own line when it is not null.
 *
 * @param otherSide the card to link; may be null to clear the link
 */
public void setOtherSide(Card otherSide) {
this.otherSide = otherSide;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@ -193,10 +202,15 @@ public class Card implements Comparable<Card> {
sb.append(rarity != null ? rarity : "").append("|");
sb.append(cardNumber != null ? cardNumber : "").append("|");
sb.append(artist != null ? artist : "");
if (otherSide != null) {
sb.append("\n").append(otherSide.toString());
}
return sb.toString();
}
/**
 * Orders cards by multiverse id and, when the ids are equal, by card number.
 * <p>
 * Both fields may be unset (the multiverse id when the card was built from a line
 * with an empty first field, the card number when the crawler could not find one),
 * so each comparison is null-safe: a missing value sorts before any present value
 * and two missing values compare as equal.
 *
 * @param o the card to compare against
 * @return a negative number, zero, or a positive number when this card sorts
 *         before, together with, or after {@code o}
 */
public int compareTo(Card o) {
    int idCompareResult = compareNullsFirst(this.multiverseId, o.getMultiverseId());
    if (idCompareResult != 0) {
        return idCompareResult;
    }
    // Same multiverse id: the card number tells the parts of a multi-part card apart.
    return compareNullsFirst(this.cardNumber, o.getCardNumber());
}

/**
 * Compares two possibly-null values, placing null before every non-null value.
 */
private static <T extends Comparable<T>> int compareNullsFirst(T left, T right) {
    if (left == null) {
        return right == null ? 0 : -1;
    }
    return right == null ? 1 : left.compareTo(right);
}
}

View file

@ -5,6 +5,9 @@ import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import north.gatherercrawler.util.CardsList;
import north.gatherercrawler.util.ParseQueue;
import north.gatherercrawler.util.ParsedList;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@ -22,11 +25,10 @@ public class CardParser extends Thread {
private boolean parseCard(Integer multiverseId) {
String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
Card card = new Card(multiverseId);
Card card;
Document doc = null;
int retries = 30;
boolean done = false;
while (retries > 0 && !done) {
while (!done) {
try {
Connection connection = Jsoup.connect(url);
connection.timeout(20000);
@ -41,113 +43,155 @@ public class CardParser extends Thread {
}
try {
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContentHeader_subtitleDisplay");
String cardName = "";
String selectorModifier = "";
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
if (!select.isEmpty()) {
cardName = select.get(0).text().trim();
}
card = extractCardData(doc, "", multiverseId);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rightCol ul li a");
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
if (!select.isEmpty()) {
card.setName(select.get(0).text().trim());
// for multi-part cards
if (!select.isEmpty()) {
String href = select.attr("href");
url = "http://gatherer.wizards.com/Pages/Card/Details.aspx" + href.substring(href.indexOf("?"));
done = false;
while (!done) {
try {
Connection connection = Jsoup.connect(url);
connection.timeout(20000);
doc = connection.get();
} catch (IOException ex) {
}
done = true;
}
if (!done) {
System.out.println("Card get exception: " + multiverseId);
} else {
card.setCardNumber(card.getCardNumber() + "b");
Card cardSide = extractCardData(doc, "", multiverseId);
cardSide.setCardNumber(cardSide.getCardNumber() + "a");
cardSide.setOtherSide(card);
card = cardSide;
}
}
} else {
card.setName(cardName);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl05_nameRow .value");
if (!select.isEmpty() && select.get(0).text().trim().equals(cardName)) {
selectorModifier = "_ctl05";
} else {
selectorModifier = "_ctl06";
// for flip / double sided cards
card = extractCardData(doc, "_ctl05", multiverseId);
if (card == null) {
return false;
}
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
List<String> manaCost = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
card.setOtherSide(extractCardData(doc, "_ctl06", multiverseId));
if (card.getOtherSide() == null) {
return false;
}
}
card.setManaCost(manaCost);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
if (!select.isEmpty()) {
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
if (!select.isEmpty()) {
card.setTypes(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
List<String> cardText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&amp;name=", "{").replace("&amp;type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace("&quot;", "\""));
}
}
card.setCardText(cardText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
List<String> flavorText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
flavorText.add(element.html().trim().replace("&quot;", "\"").replace("<i>", "").replace("</i>", ""));
}
}
card.setFlavorText(flavorText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
if (!select.isEmpty()) {
card.setPowerToughness(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
if (!select.isEmpty()) {
card.setExpansion(select.get(1).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
if (!select.isEmpty()) {
card.setRarity(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
List<Integer> otherSets = new ArrayList<Integer>();
if (!select.isEmpty()) {
for (Element element : select) {
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
}
}
// card.setOtherSets(otherSets);
for (Integer otherSet : otherSets) {
if (!ParsedList.contains(otherSet)) {
ParseQueue.add(otherSet);
}
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
if (!select.isEmpty()) {
card.setCardNumber(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
if (!select.isEmpty()) {
card.setArtist(select.get(0).text().trim());
}
} catch (Exception e) {
return false;
}
if (card == null) {
return false;
}
CardsList.add(card);
return true;
}
private Card extractCardData(Document doc, String selectorModifier, Integer id) throws NumberFormatException {
Elements select;
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cardImage");
Integer multiverseId = null;
if (!select.isEmpty()) {
Pattern pattern = Pattern.compile("(?<=multiverseid=)\\d+");
Matcher matcher = pattern.matcher(select.get(0).attr("src"));
if (matcher.find()) {
multiverseId = Integer.parseInt(matcher.group());
}
}
if (multiverseId == null) {
return null;
}
Card card = new Card(multiverseId);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_nameRow .value");
if (!select.isEmpty()) {
card.setName(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
List<String> manaCost = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
}
}
card.setManaCost(manaCost);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
if (!select.isEmpty()) {
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
if (!select.isEmpty()) {
card.setTypes(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
List<String> cardText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&amp;name=", "{").replace("&amp;type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace("&quot;", "\""));
}
}
card.setCardText(cardText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
List<String> flavorText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
flavorText.add(element.html().trim().replace("&quot;", "\"").replace("<i>", "").replace("</i>", ""));
}
}
card.setFlavorText(flavorText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
if (!select.isEmpty()) {
card.setPowerToughness(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
if (!select.isEmpty()) {
card.setExpansion(select.get(1).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
if (!select.isEmpty()) {
card.setRarity(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
List<Integer> otherSets = new ArrayList<Integer>();
if (!select.isEmpty()) {
for (Element element : select) {
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
}
}
// card.setOtherSets(otherSets);
for (Integer otherSet : otherSets) {
if (!ParsedList.contains(otherSet)) {
ParseQueue.add(otherSet);
}
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
if (!select.isEmpty()) {
card.setCardNumber(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
if (!select.isEmpty()) {
card.setArtist(select.get(0).text().trim());
}
if (card.getCardNumber() == null) {
url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
String url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
try {
Connection connection = Jsoup.connect(url);
connection.timeout(20000);
doc = connection.get();
Elements select = doc.select("small a:contains(" + card.getExpansion() + ")");
select = doc.select("small a:contains(" + card.getExpansion() + ")");
if (!select.isEmpty()) {
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
matcher.find();
@ -166,7 +210,7 @@ public class CardParser extends Thread {
}
if (card.getCardNumber() == null) {
Elements select = doc.select("p a:contains(" + card.getExpansion() + ")");
select = doc.select("p a:contains(" + card.getExpansion() + ")");
if (!select.isEmpty()) {
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
matcher.find();
@ -183,8 +227,8 @@ public class CardParser extends Thread {
System.out.println("Card number missing: " + card.getName());
}
}
CardsList.add(card);
return true;
return card;
}
@Override

View file

@ -1,12 +1,11 @@
package north.gatherercrawler;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import north.gatherercrawler.util.CardsList;
import north.gatherercrawler.util.ParseQueue;
import north.gatherercrawler.util.ParsedList;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

View file

@ -2,8 +2,8 @@ package north.gatherercrawler;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.util.Iterator;
import java.util.List;
import java.util.*;
import north.gatherercrawler.util.CardsList;
/**
*
@ -13,16 +13,35 @@ public class ThreadStarter extends Thread {
private static Integer threadsDone = 0;
private final Integer threads = 10;
private List<Card> sortedCards;
/**
 * Adds one to the shared threadsDone counter.
 * Declared static synchronized, so concurrent callers are serialized on the class lock.
 */
public static synchronized void threadDone() {
threadsDone++;
}
/**
 * Builds {@code sortedCards} from the contents of {@code CardsList}, ordered by
 * expansion and then by card number. The list is built only once; later calls
 * return immediately while {@code sortedCards} is non-null.
 * <p>
 * Expansion and card number are compared null-safely (a missing value sorts first),
 * because a single card without either value would otherwise make the sort throw
 * and prevent any output from being written.
 */
private void updateSortedCards() {
    if (sortedCards != null) {
        return;
    }

    List<Card> cards = new ArrayList<Card>();
    Iterator<Card> iterator = CardsList.iterator();
    while (iterator.hasNext()) {
        cards.add(iterator.next());
    }

    Collections.sort(cards, new Comparator<Card>() {

        public int compare(Card o1, Card o2) {
            int expansionCompare = compareNullsFirst(o1.getExpansion(), o2.getExpansion());
            return expansionCompare != 0
                    ? expansionCompare
                    : compareNullsFirst(o1.getCardNumber(), o2.getCardNumber());
        }

        // Null sorts before any non-null string; two nulls are equal.
        private int compareNullsFirst(String left, String right) {
            if (left == null) {
                return right == null ? 0 : -1;
            }
            return right == null ? 1 : left.compareTo(right);
        }
    });

    // Publish only after sorting succeeded so a failure never leaves a half-built list cached.
    sortedCards = cards;
}
private void writeCardsToFile() {
try {
FileWriter fstream = new FileWriter("cards-data.txt");
BufferedWriter out = new BufferedWriter(fstream);
Iterator<Card> iterator = CardsList.iterator();
Iterator<Card> iterator = sortedCards.iterator();
while (iterator.hasNext()) {
out.write(iterator.next().toString());
out.newLine();
@ -37,7 +56,7 @@ public class ThreadStarter extends Thread {
try {
FileWriter fstream = new FileWriter("mtg-cards-data.txt");
BufferedWriter out = new BufferedWriter(fstream);
Iterator<Card> iterator = CardsList.iterator();
Iterator<Card> iterator = sortedCards.iterator();
while (iterator.hasNext()) {
Card card = iterator.next();
StringBuilder sb = new StringBuilder();
@ -79,7 +98,7 @@ public class ThreadStarter extends Thread {
} else {
sb.append("||");
}
List<String> cardText = card.getCardText();
for (int i = 0; i < cardText.size(); i++) {
sb.append(cardText.get(i));
@ -114,6 +133,7 @@ public class ThreadStarter extends Thread {
}
}
updateSortedCards();
writeCardsToFile();
writeCardsToUtilFile();
}

View file

@ -1,7 +1,8 @@
package north.gatherercrawler;
package north.gatherercrawler.util;
import java.util.Iterator;
import java.util.concurrent.ConcurrentSkipListSet;
import north.gatherercrawler.Card;
/**
*

View file

@ -1,4 +1,4 @@
package north.gatherercrawler;
package north.gatherercrawler.util;
import java.util.concurrent.ConcurrentLinkedQueue;

View file

@ -1,4 +1,4 @@
package north.gatherercrawler;
package north.gatherercrawler.util;
import java.util.concurrent.ConcurrentSkipListSet;