[GathererCrawler] updated to support flip cards, split cards and double-sided cards.

This commit is contained in:
North 2012-01-22 13:48:34 +02:00
parent 5be4f1c291
commit d0a934f2dc
7 changed files with 190 additions and 112 deletions

View file

@ -22,13 +22,14 @@ public class Card implements Comparable<Card> {
private String rarity; private String rarity;
private String cardNumber; private String cardNumber;
private String artist; private String artist;
private Card otherSide;
public Card(Integer multiverseId) { public Card(Integer multiverseId) {
this.multiverseId = multiverseId; this.multiverseId = multiverseId;
} }
public Card(String card) { public Card(String card) {
String[] split = card.split("\\|",13); String[] split = card.split("\\|", 13);
if (split[0].length() > 0) { if (split[0].length() > 0) {
multiverseId = Integer.parseInt(split[0]); multiverseId = Integer.parseInt(split[0]);
} }
@ -160,6 +161,14 @@ public class Card implements Comparable<Card> {
this.types = types; this.types = types;
} }
/**
 * Returns the card linked to this one as its other part.
 *
 * @return the linked card, or {@code null} when none has been set
 */
public Card getOtherSide() {
    return this.otherSide;
}
/**
 * Links another card to this one as its other part. When a link is present,
 * {@link #toString()} appends the linked card's text on a new line.
 *
 * @param otherSide the card to link, or {@code null} to clear the link
 */
public void setOtherSide(Card otherSide) {
    this.otherSide = otherSide;
}
@Override @Override
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
@ -193,10 +202,15 @@ public class Card implements Comparable<Card> {
sb.append(rarity != null ? rarity : "").append("|"); sb.append(rarity != null ? rarity : "").append("|");
sb.append(cardNumber != null ? cardNumber : "").append("|"); sb.append(cardNumber != null ? cardNumber : "").append("|");
sb.append(artist != null ? artist : ""); sb.append(artist != null ? artist : "");
if (otherSide != null) {
sb.append("\n").append(otherSide.toString());
}
return sb.toString(); return sb.toString();
} }
public int compareTo(Card o) { public int compareTo(Card o) {
return this.multiverseId.compareTo(o.getMultiverseId()); int idCompareResult = this.multiverseId.compareTo(o.getMultiverseId());
return idCompareResult == 0 ? this.cardNumber.compareTo(o.getCardNumber()) : idCompareResult;
} }
} }

View file

@ -5,6 +5,9 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import north.gatherercrawler.util.CardsList;
import north.gatherercrawler.util.ParseQueue;
import north.gatherercrawler.util.ParsedList;
import org.jsoup.Connection; import org.jsoup.Connection;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -22,11 +25,10 @@ public class CardParser extends Thread {
private boolean parseCard(Integer multiverseId) { private boolean parseCard(Integer multiverseId) {
String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId; String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
Card card = new Card(multiverseId); Card card;
Document doc = null; Document doc = null;
int retries = 30;
boolean done = false; boolean done = false;
while (retries > 0 && !done) { while (!done) {
try { try {
Connection connection = Jsoup.connect(url); Connection connection = Jsoup.connect(url);
connection.timeout(20000); connection.timeout(20000);
@ -41,113 +43,155 @@ public class CardParser extends Thread {
} }
try { try {
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContentHeader_subtitleDisplay"); Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
String cardName = "";
String selectorModifier = "";
if (!select.isEmpty()) { if (!select.isEmpty()) {
cardName = select.get(0).text().trim(); card = extractCardData(doc, "", multiverseId);
} select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rightCol ul li a");
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value"); // for multi-part cards
if (!select.isEmpty()) { if (!select.isEmpty()) {
card.setName(select.get(0).text().trim()); String href = select.attr("href");
url = "http://gatherer.wizards.com/Pages/Card/Details.aspx" + href.substring(href.indexOf("?"));
done = false;
while (!done) {
try {
Connection connection = Jsoup.connect(url);
connection.timeout(20000);
doc = connection.get();
} catch (IOException ex) {
}
done = true;
}
if (!done) {
System.out.println("Card get exception: " + multiverseId);
} else {
card.setCardNumber(card.getCardNumber() + "b");
Card cardSide = extractCardData(doc, "", multiverseId);
cardSide.setCardNumber(cardSide.getCardNumber() + "a");
cardSide.setOtherSide(card);
card = cardSide;
}
}
} else { } else {
card.setName(cardName); // for flip / double sided cards
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl05_nameRow .value"); card = extractCardData(doc, "_ctl05", multiverseId);
if (!select.isEmpty() && select.get(0).text().trim().equals(cardName)) { if (card == null) {
selectorModifier = "_ctl05"; return false;
} else {
selectorModifier = "_ctl06";
} }
} card.setOtherSide(extractCardData(doc, "_ctl06", multiverseId));
if (card.getOtherSide() == null) {
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img"); return false;
List<String> manaCost = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
} }
} }
card.setManaCost(manaCost);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
if (!select.isEmpty()) {
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
if (!select.isEmpty()) {
card.setTypes(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
List<String> cardText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&amp;name=", "{").replace("&amp;type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace("&quot;", "\""));
}
}
card.setCardText(cardText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
List<String> flavorText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
flavorText.add(element.html().trim().replace("&quot;", "\"").replace("<i>", "").replace("</i>", ""));
}
}
card.setFlavorText(flavorText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
if (!select.isEmpty()) {
card.setPowerToughness(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
if (!select.isEmpty()) {
card.setExpansion(select.get(1).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
if (!select.isEmpty()) {
card.setRarity(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
List<Integer> otherSets = new ArrayList<Integer>();
if (!select.isEmpty()) {
for (Element element : select) {
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
}
}
// card.setOtherSets(otherSets);
for (Integer otherSet : otherSets) {
if (!ParsedList.contains(otherSet)) {
ParseQueue.add(otherSet);
}
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
if (!select.isEmpty()) {
card.setCardNumber(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
if (!select.isEmpty()) {
card.setArtist(select.get(0).text().trim());
}
} catch (Exception e) { } catch (Exception e) {
return false; return false;
} }
if (card == null) {
return false;
}
CardsList.add(card);
return true;
}
private Card extractCardData(Document doc, String selectorModifier, Integer id) throws NumberFormatException {
Elements select;
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cardImage");
Integer multiverseId = null;
if (!select.isEmpty()) {
Pattern pattern = Pattern.compile("(?<=multiverseid=)\\d+");
Matcher matcher = pattern.matcher(select.get(0).attr("src"));
if (matcher.find()) {
multiverseId = Integer.parseInt(matcher.group());
}
}
if (multiverseId == null) {
return null;
}
Card card = new Card(multiverseId);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_nameRow .value");
if (!select.isEmpty()) {
card.setName(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
List<String> manaCost = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
}
}
card.setManaCost(manaCost);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
if (!select.isEmpty()) {
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
if (!select.isEmpty()) {
card.setTypes(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
List<String> cardText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&amp;name=", "{").replace("&amp;type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace("&quot;", "\""));
}
}
card.setCardText(cardText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
List<String> flavorText = new ArrayList<String>();
if (!select.isEmpty()) {
for (Element element : select) {
flavorText.add(element.html().trim().replace("&quot;", "\"").replace("<i>", "").replace("</i>", ""));
}
}
card.setFlavorText(flavorText);
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
if (!select.isEmpty()) {
card.setPowerToughness(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
if (!select.isEmpty()) {
card.setExpansion(select.get(1).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
if (!select.isEmpty()) {
card.setRarity(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
List<Integer> otherSets = new ArrayList<Integer>();
if (!select.isEmpty()) {
for (Element element : select) {
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
}
}
// card.setOtherSets(otherSets);
for (Integer otherSet : otherSets) {
if (!ParsedList.contains(otherSet)) {
ParseQueue.add(otherSet);
}
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
if (!select.isEmpty()) {
card.setCardNumber(select.get(0).text().trim());
}
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
if (!select.isEmpty()) {
card.setArtist(select.get(0).text().trim());
}
if (card.getCardNumber() == null) { if (card.getCardNumber() == null) {
url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+'); String url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
try { try {
Connection connection = Jsoup.connect(url); Connection connection = Jsoup.connect(url);
connection.timeout(20000); connection.timeout(20000);
doc = connection.get(); doc = connection.get();
Elements select = doc.select("small a:contains(" + card.getExpansion() + ")"); select = doc.select("small a:contains(" + card.getExpansion() + ")");
if (!select.isEmpty()) { if (!select.isEmpty()) {
Matcher matcher = patternUrl.matcher(select.get(0).attr("href")); Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
matcher.find(); matcher.find();
@ -166,7 +210,7 @@ public class CardParser extends Thread {
} }
if (card.getCardNumber() == null) { if (card.getCardNumber() == null) {
Elements select = doc.select("p a:contains(" + card.getExpansion() + ")"); select = doc.select("p a:contains(" + card.getExpansion() + ")");
if (!select.isEmpty()) { if (!select.isEmpty()) {
Matcher matcher = patternUrl.matcher(select.get(0).attr("href")); Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
matcher.find(); matcher.find();
@ -183,8 +227,8 @@ public class CardParser extends Thread {
System.out.println("Card number missing: " + card.getName()); System.out.println("Card number missing: " + card.getName());
} }
} }
CardsList.add(card);
return true; return card;
} }
@Override @Override

View file

@ -1,12 +1,11 @@
package north.gatherercrawler; package north.gatherercrawler;
import java.io.BufferedReader; import java.io.*;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import north.gatherercrawler.util.CardsList;
import north.gatherercrawler.util.ParseQueue;
import north.gatherercrawler.util.ParsedList;
import org.jsoup.Connection; import org.jsoup.Connection;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;

View file

@ -2,8 +2,8 @@ package north.gatherercrawler;
import java.io.BufferedWriter; import java.io.BufferedWriter;
import java.io.FileWriter; import java.io.FileWriter;
import java.util.Iterator; import java.util.*;
import java.util.List; import north.gatherercrawler.util.CardsList;
/** /**
* *
@ -13,16 +13,35 @@ public class ThreadStarter extends Thread {
private static Integer threadsDone = 0; private static Integer threadsDone = 0;
private final Integer threads = 10; private final Integer threads = 10;
private List<Card> sortedCards;
public static synchronized void threadDone() { public static synchronized void threadDone() {
threadsDone++; threadsDone++;
} }
/**
 * Builds, once, the list of all crawled cards ordered by expansion and then
 * by card number, and caches it in {@code sortedCards}. Later calls are
 * no-ops once the cache is populated.
 *
 * Either sort key may be null: the parser only sets expansion and card number
 * when the matching page element is found. Null keys are ordered before
 * non-null ones instead of raising a NullPointerException mid-sort.
 */
private void updateSortedCards() {
    if (sortedCards != null) {
        return;
    }

    List<Card> snapshot = new ArrayList<Card>();
    Iterator<Card> iterator = CardsList.iterator();
    while (iterator.hasNext()) {
        snapshot.add(iterator.next());
    }

    Collections.sort(snapshot, new Comparator<Card>() {

        public int compare(Card o1, Card o2) {
            int expansionCompare = compareNullsFirst(o1.getExpansion(), o2.getExpansion());
            return expansionCompare != 0
                    ? expansionCompare
                    : compareNullsFirst(o1.getCardNumber(), o2.getCardNumber());
        }

        // Natural ordering, except that null sorts before any non-null value.
        private <T extends Comparable<T>> int compareNullsFirst(T left, T right) {
            if (left == null) {
                return right == null ? 0 : -1;
            }
            return right == null ? 1 : left.compareTo(right);
        }
    });

    // Publish only after a successful sort so a failure cannot leave an
    // unsorted list cached in the field.
    sortedCards = snapshot;
}
private void writeCardsToFile() { private void writeCardsToFile() {
try { try {
FileWriter fstream = new FileWriter("cards-data.txt"); FileWriter fstream = new FileWriter("cards-data.txt");
BufferedWriter out = new BufferedWriter(fstream); BufferedWriter out = new BufferedWriter(fstream);
Iterator<Card> iterator = CardsList.iterator(); Iterator<Card> iterator = sortedCards.iterator();
while (iterator.hasNext()) { while (iterator.hasNext()) {
out.write(iterator.next().toString()); out.write(iterator.next().toString());
out.newLine(); out.newLine();
@ -37,7 +56,7 @@ public class ThreadStarter extends Thread {
try { try {
FileWriter fstream = new FileWriter("mtg-cards-data.txt"); FileWriter fstream = new FileWriter("mtg-cards-data.txt");
BufferedWriter out = new BufferedWriter(fstream); BufferedWriter out = new BufferedWriter(fstream);
Iterator<Card> iterator = CardsList.iterator(); Iterator<Card> iterator = sortedCards.iterator();
while (iterator.hasNext()) { while (iterator.hasNext()) {
Card card = iterator.next(); Card card = iterator.next();
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
@ -79,7 +98,7 @@ public class ThreadStarter extends Thread {
} else { } else {
sb.append("||"); sb.append("||");
} }
List<String> cardText = card.getCardText(); List<String> cardText = card.getCardText();
for (int i = 0; i < cardText.size(); i++) { for (int i = 0; i < cardText.size(); i++) {
sb.append(cardText.get(i)); sb.append(cardText.get(i));
@ -114,6 +133,7 @@ public class ThreadStarter extends Thread {
} }
} }
updateSortedCards();
writeCardsToFile(); writeCardsToFile();
writeCardsToUtilFile(); writeCardsToUtilFile();
} }

View file

@ -1,7 +1,8 @@
package north.gatherercrawler; package north.gatherercrawler.util;
import java.util.Iterator; import java.util.Iterator;
import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ConcurrentSkipListSet;
import north.gatherercrawler.Card;
/** /**
* *

View file

@ -1,4 +1,4 @@
package north.gatherercrawler; package north.gatherercrawler.util;
import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.ConcurrentLinkedQueue;

View file

@ -1,4 +1,4 @@
package north.gatherercrawler; package north.gatherercrawler.util;
import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.ConcurrentSkipListSet;