package north.gatherercrawler;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
|
|
|
|
*
|
2011-08-28 14:04:21 +00:00
|
|
|
* @author North
|
2011-07-23 16:18:37 +00:00
|
|
|
*/
|
|
|
|
public class CardParser extends Thread {
|
|
|
|
|
2011-08-05 20:11:25 +00:00
|
|
|
private static final Pattern patternPrint = Pattern.compile("(?<=#)[\\w\\d]+?(?= )");
|
|
|
|
private static final Pattern patternUrl = Pattern.compile("(?<=/)[\\w\\d]+?(?=\\.html)");
|
|
|
|
|
2011-07-23 16:18:37 +00:00
|
|
|
private boolean parseCard(Integer multiverseId) {
|
|
|
|
String url = "http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=" + multiverseId;
|
|
|
|
Card card = new Card(multiverseId);
|
|
|
|
Document doc = null;
|
|
|
|
int retries = 30;
|
|
|
|
boolean done = false;
|
|
|
|
while (retries > 0 && !done) {
|
|
|
|
try {
|
|
|
|
Connection connection = Jsoup.connect(url);
|
|
|
|
connection.timeout(20000);
|
|
|
|
doc = connection.get();
|
|
|
|
} catch (IOException ex) {
|
|
|
|
}
|
|
|
|
done = true;
|
|
|
|
}
|
|
|
|
if (!done) {
|
|
|
|
System.out.println("Card get exception: " + multiverseId);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
2011-09-25 08:54:24 +00:00
|
|
|
Elements select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContentHeader_subtitleDisplay");
|
|
|
|
String cardName = "";
|
|
|
|
String selectorModifier = "";
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
cardName = select.get(0).text().trim();
|
|
|
|
}
|
|
|
|
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow .value");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setName(select.get(0).text().trim());
|
2011-09-25 08:54:24 +00:00
|
|
|
} else {
|
|
|
|
card.setName(cardName);
|
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl05_nameRow .value");
|
|
|
|
if (!select.isEmpty() && select.get(0).text().trim().equals(cardName)) {
|
|
|
|
selectorModifier = "_ctl05";
|
|
|
|
} else {
|
|
|
|
selectorModifier = "_ctl06";
|
|
|
|
}
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_manaRow .value img");
|
2011-07-23 16:18:37 +00:00
|
|
|
List<String> manaCost = new ArrayList<String>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
|
|
|
manaCost.add(element.attr("src").replace("/Handlers/Image.ashx?size=medium&name=", "").replace("&type=symbol", "").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", ""));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
card.setManaCost(manaCost);
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_cmcRow .value");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setConvertedManaCost(Integer.parseInt(select.get(0).text().trim()));
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_typeRow .value");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setTypes(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_textRow .value .cardtextbox");
|
2011-07-23 16:18:37 +00:00
|
|
|
List<String> cardText = new ArrayList<String>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
2011-08-28 14:04:21 +00:00
|
|
|
cardText.add(element.html().trim().replace("<img src=\"/Handlers/Image.ashx?size=small&name=", "{").replace("&type=symbol", "}").replaceAll("\" alt=\"[\\d\\w\\s]+?\" align=\"absbottom\" />", "").replace("\n", "").replace(""", "\""));
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
card.setCardText(cardText);
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_FlavorText .cardtextbox");
|
2011-08-28 14:04:21 +00:00
|
|
|
List<String> flavorText = new ArrayList<String>();
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
2011-08-28 14:04:21 +00:00
|
|
|
for (Element element : select) {
|
2011-09-11 19:58:18 +00:00
|
|
|
flavorText.add(element.html().trim().replace(""", "\"").replace("<i>", "").replace("</i>", ""));
|
2011-08-28 14:04:21 +00:00
|
|
|
}
|
2011-07-23 16:18:37 +00:00
|
|
|
}
|
2011-08-28 14:04:21 +00:00
|
|
|
card.setFlavorText(flavorText);
|
2011-07-23 16:18:37 +00:00
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ptRow .value");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setPowerToughness(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_currentSetSymbol a");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setExpansion(select.get(1).text().trim());
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_rarityRow .value span");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setRarity(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_otherSetsValue a");
|
2011-07-23 16:18:37 +00:00
|
|
|
List<Integer> otherSets = new ArrayList<Integer>();
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
for (Element element : select) {
|
|
|
|
otherSets.add(Integer.parseInt(element.attr("href").replace("Details.aspx?multiverseid=", "")));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// card.setOtherSets(otherSets);
|
|
|
|
for (Integer otherSet : otherSets) {
|
|
|
|
if (!ParsedList.contains(otherSet)) {
|
|
|
|
ParseQueue.add(otherSet);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_numberRow .value");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setCardNumber(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
|
2011-09-25 08:54:24 +00:00
|
|
|
select = doc.select("#ctl00_ctl00_ctl00_MainContent_SubContent_SubContent" + selectorModifier + "_ArtistCredit a");
|
2011-07-23 16:18:37 +00:00
|
|
|
if (!select.isEmpty()) {
|
|
|
|
card.setArtist(select.get(0).text().trim());
|
|
|
|
}
|
|
|
|
} catch (Exception e) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2011-08-05 20:11:25 +00:00
|
|
|
if (card.getCardNumber() == null) {
|
|
|
|
url = "http://magiccards.info/query?q=" + card.getName().replace(' ', '+');
|
|
|
|
try {
|
|
|
|
Connection connection = Jsoup.connect(url);
|
|
|
|
connection.timeout(20000);
|
|
|
|
doc = connection.get();
|
|
|
|
|
|
|
|
Elements select = doc.select("small a:contains(" + card.getExpansion() + ")");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
} else {
|
|
|
|
select = doc.select("small b:contains(#)");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternPrint.matcher(select.get(0).html());
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} catch (IOException ex) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (card.getCardNumber() == null) {
|
2011-08-28 14:04:21 +00:00
|
|
|
Elements select = doc.select("p a:contains(" + card.getExpansion() + ")");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternUrl.matcher(select.get(0).attr("href"));
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
} else {
|
|
|
|
select = doc.select("p b:contains(#)");
|
|
|
|
if (!select.isEmpty()) {
|
|
|
|
Matcher matcher = patternPrint.matcher(select.get(0).html());
|
|
|
|
matcher.find();
|
|
|
|
card.setCardNumber(matcher.group());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (card.getCardNumber() == null) {
|
|
|
|
System.out.println("Card number missing: " + card.getName());
|
|
|
|
}
|
2011-08-05 20:11:25 +00:00
|
|
|
}
|
2011-07-23 16:18:37 +00:00
|
|
|
CardsList.add(card);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void run() {
|
|
|
|
while (!ParseQueue.isEmpty()) {
|
|
|
|
Integer multiverseId = ParseQueue.remove();
|
|
|
|
if (!ParsedList.contains(multiverseId)) {
|
|
|
|
ParsedList.add(multiverseId);
|
|
|
|
parseCard(multiverseId);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ThreadStarter.threadDone();
|
|
|
|
}
|
|
|
|
}
|