mirror of
https://github.com/correl/mage.git
synced 2024-12-24 03:00:14 +00:00
[GathererCrawler] updated to support flip cards, split cards and double sided cards.
This commit is contained in:
parent
5be4f1c291
commit
d0a934f2dc
7 changed files with 190 additions and 112 deletions
|
@ -22,13 +22,14 @@ public class Card implements Comparable<Card> {
|
|||
private String rarity;
|
||||
private String cardNumber;
|
||||
private String artist;
|
||||
private Card otherSide;
|
||||
|
||||
/**
 * Creates a card that carries only its multiverse id; no other field is
 * assigned by this constructor.
 *
 * @param multiverseId the id to store on the new card
 */
public Card(Integer multiverseId) {
    this.multiverseId = multiverseId;
}
|
||||
|
||||
public Card(String card) {
|
||||
String[] split = card.split("\\|",13);
|
||||
String[] split = card.split("\\|", 13);
|
||||
if (split[0].length() > 0) {
|
||||
multiverseId = Integer.parseInt(split[0]);
|
||||
}
|
||||
|
@ -160,6 +161,14 @@ public class Card implements Comparable<Card> {
|
|||
this.types = types;
|
||||
}
|
||||
|
||||
public Card getOtherSide() {
|
||||
return otherSide;
|
||||
}
|
||||
|
||||
public void setOtherSide(Card otherSide) {
|
||||
this.otherSide = otherSide;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
@ -193,10 +202,15 @@ public class Card implements Comparable<Card> {
|
|||
sb.append(rarity != null ? rarity : "").append("|");
|
||||
sb.append(cardNumber != null ? cardNumber : "").append("|");
|
||||
sb.append(artist != null ? artist : "");
|
||||
|
||||
if (otherSide != null) {
|
||||
sb.append("\n").append(otherSide.toString());
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public int compareTo(Card o) {
|
||||
return this.multiverseId.compareTo(o.getMultiverseId());
|
||||
int idCompareResult = this.multiverseId.compareTo(o.getMultiverseId());
|
||||
return idCompareResult == 0 ? this.cardNumber.compareTo(o.getCardNumber()) : idCompareResult;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,6 +5,9 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import north.gatherercrawler.util.CardsList;
|
||||
import north.gatherercrawler.util.ParseQueue;
|
||||
import north.gatherercrawler.util.ParsedList;
|
||||
import org.jsoup.Connection;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
@ -22,11 +25,10 @@ public class CardParser extends Thread {
|
|||
|
||||
private boolean parseCard(Integer multiverseId) {
|
||||
String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
|
||||
Card card = new Card(multiverseId);
|
||||
Card card;
|
||||
Document doc = null;
|
||||
int retries = 30;
|
||||
boolean done = false;
|
||||
while (retries > 0 && !done) {
|
||||
while (!done) {
|
||||
try {
|
||||
Connection connection = Jsoup.connect(url);
|
||||
connection.timeout(20000);
|
||||
|
@ -41,113 +43,155 @@ public class CardParser extends Thread {
|
|||
}
|
||||
|
||||
try {
|
||||
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContentHeader_subtitleDisplay");
|
||||
String cardName = "";
|
||||
String selectorModifier = "";
|
||||
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
cardName = select.get(0).text().trim();
|
||||
}
|
||||
card = extractCardData(doc, "", multiverseId);
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rightCol ul li a");
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setName(select.get(0).text().trim());
|
||||
// for multi-part cards
|
||||
if (!select.isEmpty()) {
|
||||
String href = select.attr("href");
|
||||
url = "http://gatherer.wizards.com/Pages/Card/Details.aspx" + href.substring(href.indexOf("?"));
|
||||
|
||||
done = false;
|
||||
while (!done) {
|
||||
try {
|
||||
Connection connection = Jsoup.connect(url);
|
||||
connection.timeout(20000);
|
||||
doc = connection.get();
|
||||
} catch (IOException ex) {
|
||||
}
|
||||
done = true;
|
||||
}
|
||||
if (!done) {
|
||||
System.out.println("Card get exception: " + multiverseId);
|
||||
} else {
|
||||
card.setCardNumber(card.getCardNumber() + "b");
|
||||
Card cardSide = extractCardData(doc, "", multiverseId);
|
||||
cardSide.setCardNumber(cardSide.getCardNumber() + "a");
|
||||
cardSide.setOtherSide(card);
|
||||
card = cardSide;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
card.setName(cardName);
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl05_nameRow .value");
|
||||
if (!select.isEmpty() && select.get(0).text().trim().equals(cardName)) {
|
||||
selectorModifier = "_ctl05";
|
||||
} else {
|
||||
selectorModifier = "_ctl06";
|
||||
// for flip / double sided cards
|
||||
card = extractCardData(doc, "_ctl05", multiverseId);
|
||||
if (card == null) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
|
||||
List<String> manaCost = new ArrayList<String>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
|
||||
card.setOtherSide(extractCardData(doc, "_ctl06", multiverseId));
|
||||
if (card.getOtherSide() == null) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
card.setManaCost(manaCost);
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setTypes(select.get(0).text().trim());
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
|
||||
List<String> cardText = new ArrayList<String>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&name=", "{").replace("&type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace(""", "\""));
|
||||
}
|
||||
}
|
||||
card.setCardText(cardText);
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
|
||||
List<String> flavorText = new ArrayList<String>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
flavorText.add(element.html().trim().replace(""", "\"").replace("<i>", "").replace("</i>", ""));
|
||||
}
|
||||
}
|
||||
card.setFlavorText(flavorText);
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setPowerToughness(select.get(0).text().trim());
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
|
||||
if (!select.isEmpty()) {
|
||||
card.setExpansion(select.get(1).text().trim());
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
|
||||
if (!select.isEmpty()) {
|
||||
card.setRarity(select.get(0).text().trim());
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
|
||||
List<Integer> otherSets = new ArrayList<Integer>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
|
||||
}
|
||||
}
|
||||
// card.setOtherSets(otherSets);
|
||||
for (Integer otherSet : otherSets) {
|
||||
if (!ParsedList.contains(otherSet)) {
|
||||
ParseQueue.add(otherSet);
|
||||
}
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setCardNumber(select.get(0).text().trim());
|
||||
}
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
|
||||
if (!select.isEmpty()) {
|
||||
card.setArtist(select.get(0).text().trim());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
if (card == null) {
|
||||
return false;
|
||||
}
|
||||
CardsList.add(card);
|
||||
return true;
|
||||
}
|
||||
|
||||
private Card extractCardData(Document doc, String selectorModifier, Integer id) throws NumberFormatException {
|
||||
Elements select;
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cardImage");
|
||||
Integer multiverseId = null;
|
||||
if (!select.isEmpty()) {
|
||||
Pattern pattern = Pattern.compile("(?<=multiverseid=)\\d+");
|
||||
Matcher matcher = pattern.matcher(select.get(0).attr("src"));
|
||||
if (matcher.find()) {
|
||||
multiverseId = Integer.parseInt(matcher.group());
|
||||
}
|
||||
}
|
||||
if (multiverseId == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Card card = new Card(multiverseId);
|
||||
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_nameRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setName(select.get(0).text().trim());
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
|
||||
List<String> manaCost = new ArrayList<String>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
|
||||
}
|
||||
}
|
||||
card.setManaCost(manaCost);
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setTypes(select.get(0).text().trim());
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
|
||||
List<String> cardText = new ArrayList<String>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&name=", "{").replace("&type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace(""", "\""));
|
||||
}
|
||||
}
|
||||
card.setCardText(cardText);
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
|
||||
List<String> flavorText = new ArrayList<String>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
flavorText.add(element.html().trim().replace(""", "\"").replace("<i>", "").replace("</i>", ""));
|
||||
}
|
||||
}
|
||||
card.setFlavorText(flavorText);
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setPowerToughness(select.get(0).text().trim());
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
|
||||
if (!select.isEmpty()) {
|
||||
card.setExpansion(select.get(1).text().trim());
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
|
||||
if (!select.isEmpty()) {
|
||||
card.setRarity(select.get(0).text().trim());
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
|
||||
List<Integer> otherSets = new ArrayList<Integer>();
|
||||
if (!select.isEmpty()) {
|
||||
for (Element element : select) {
|
||||
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
|
||||
}
|
||||
}
|
||||
// card.setOtherSets(otherSets);
|
||||
for (Integer otherSet : otherSets) {
|
||||
if (!ParsedList.contains(otherSet)) {
|
||||
ParseQueue.add(otherSet);
|
||||
}
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
|
||||
if (!select.isEmpty()) {
|
||||
card.setCardNumber(select.get(0).text().trim());
|
||||
}
|
||||
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
|
||||
if (!select.isEmpty()) {
|
||||
card.setArtist(select.get(0).text().trim());
|
||||
}
|
||||
|
||||
if (card.getCardNumber() == null) {
|
||||
url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
|
||||
String url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
|
||||
try {
|
||||
Connection connection = Jsoup.connect(url);
|
||||
connection.timeout(20000);
|
||||
doc = connection.get();
|
||||
|
||||
Elements select = doc.select("small a:contains(" + card.getExpansion() + ")");
|
||||
select = doc.select("small a:contains(" + card.getExpansion() + ")");
|
||||
if (!select.isEmpty()) {
|
||||
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
|
||||
matcher.find();
|
||||
|
@ -166,7 +210,7 @@ public class CardParser extends Thread {
|
|||
}
|
||||
|
||||
if (card.getCardNumber() == null) {
|
||||
Elements select = doc.select("p a:contains(" + card.getExpansion() + ")");
|
||||
select = doc.select("p a:contains(" + card.getExpansion() + ")");
|
||||
if (!select.isEmpty()) {
|
||||
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
|
||||
matcher.find();
|
||||
|
@ -183,8 +227,8 @@ public class CardParser extends Thread {
|
|||
System.out.println("Card number missing: " + card.getName());
|
||||
}
|
||||
}
|
||||
CardsList.add(card);
|
||||
return true;
|
||||
|
||||
return card;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,12 +1,11 @@
|
|||
package north.gatherercrawler;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import north.gatherercrawler.util.CardsList;
|
||||
import north.gatherercrawler.util.ParseQueue;
|
||||
import north.gatherercrawler.util.ParsedList;
|
||||
import org.jsoup.Connection;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
|
|
@ -2,8 +2,8 @@ package north.gatherercrawler;
|
|||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.FileWriter;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import north.gatherercrawler.util.CardsList;
|
||||
|
||||
/**
|
||||
*
|
||||
|
@ -13,16 +13,35 @@ public class ThreadStarter extends Thread {
|
|||
|
||||
private static Integer threadsDone = 0;
|
||||
private final Integer threads = 10;
|
||||
private List<Card> sortedCards;
|
||||
|
||||
public static synchronized void threadDone() {
|
||||
threadsDone++;
|
||||
}
|
||||
|
||||
private void updateSortedCards() {
|
||||
if (sortedCards == null) {
|
||||
sortedCards = new ArrayList<Card>();
|
||||
Iterator<Card> iterator = CardsList.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
sortedCards.add(iterator.next());
|
||||
}
|
||||
|
||||
Collections.sort(sortedCards, new Comparator<Card>() {
|
||||
|
||||
public int compare(Card o1, Card o2) {
|
||||
int expansionCompare = o1.getExpansion().compareTo(o2.getExpansion());
|
||||
return expansionCompare != 0 ? expansionCompare : o1.getCardNumber().compareTo(o2.getCardNumber());
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private void writeCardsToFile() {
|
||||
try {
|
||||
FileWriter fstream = new FileWriter("cards-data.txt");
|
||||
BufferedWriter out = new BufferedWriter(fstream);
|
||||
Iterator<Card> iterator = CardsList.iterator();
|
||||
Iterator<Card> iterator = sortedCards.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
out.write(iterator.next().toString());
|
||||
out.newLine();
|
||||
|
@ -37,7 +56,7 @@ public class ThreadStarter extends Thread {
|
|||
try {
|
||||
FileWriter fstream = new FileWriter("mtg-cards-data.txt");
|
||||
BufferedWriter out = new BufferedWriter(fstream);
|
||||
Iterator<Card> iterator = CardsList.iterator();
|
||||
Iterator<Card> iterator = sortedCards.iterator();
|
||||
while (iterator.hasNext()) {
|
||||
Card card = iterator.next();
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
@ -79,7 +98,7 @@ public class ThreadStarter extends Thread {
|
|||
} else {
|
||||
sb.append("||");
|
||||
}
|
||||
|
||||
|
||||
List<String> cardText = card.getCardText();
|
||||
for (int i = 0; i < cardText.size(); i++) {
|
||||
sb.append(cardText.get(i));
|
||||
|
@ -114,6 +133,7 @@ public class ThreadStarter extends Thread {
|
|||
}
|
||||
}
|
||||
|
||||
updateSortedCards();
|
||||
writeCardsToFile();
|
||||
writeCardsToUtilFile();
|
||||
}
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
package north.gatherercrawler;
|
||||
package north.gatherercrawler.util;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.ConcurrentSkipListSet;
|
||||
import north.gatherercrawler.Card;
|
||||
|
||||
/**
|
||||
*
|
|
@ -1,4 +1,4 @@
|
|||
package north.gatherercrawler;
|
||||
package north.gatherercrawler.util;
|
||||
|
||||
import java.util.concurrent.ConcurrentLinkedQueue;
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package north.gatherercrawler;
|
||||
package north.gatherercrawler.util;
|
||||
|
||||
import java.util.concurrent.ConcurrentSkipListSet;
|
||||
|
Loading…
Reference in a new issue