/*
 * Copyright 2009-2010 the Fess Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package jp.sf.fess.transformer;

import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import jp.sf.fess.Constants;
import jp.sf.fess.db.exentity.CrawlingConfig;
import jp.sf.fess.helper.CrawlingConfigHelper;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.PathMappingHelper;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.seasar.framework.container.SingletonS2Container;
import org.seasar.framework.util.SerializeUtil;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.RobotCrawlAccessException;
import org.seasar.robot.RobotSystemException;
import org.seasar.robot.db.cbean.AccessResultDataCB;
import org.seasar.robot.db.exbhv.AccessResultDataBhv;
import org.seasar.robot.entity.AccessResultData;
import org.seasar.robot.entity.ExtractData;
import org.seasar.robot.entity.ResponseData;
import org.seasar.robot.entity.ResultData;
import org.seasar.robot.entity.UrlQueue;
import org.seasar.robot.extractor.Extractor;
import org.seasar.robot.util.CrawlingParameterUtil;
import org.seasar.robot.util.LruHashMap;

/**
 * Base transformer for file-like resources.
 * <p>
 * The text of a response body is extracted through the {@link Extractor}
 * supplied by the subclass and stored, together with fields derived from the
 * URL and the crawling configuration, in a map that is serialized into the
 * returned {@link ResultData}.
 */
public abstract class AbstractFessFileTransformer extends
        AbstractFessXpathTransformer {

    /**
     * Smallest width accepted by {@link StringUtils#abbreviate(String, int)};
     * smaller values make it throw an IllegalArgumentException.
     */
    private static final int MIN_ABBREVIATION_WIDTH = 4;

    /**
     * Charset name used by {@link #decodeUrl(String)}. When null, the
     * encoding recorded for the parent URL is looked up instead.
     */
    public String encoding = null;

    /** Title used for URLs ending with "/" whose indexed content is blank. */
    public String noTitleLabel = "No title.";

    /**
     * Number of UTF-8 bytes an abbreviated string may exceed its maximum
     * width by before {@link #abbreviate(String, int)} shortens it further.
     */
    public int abbreviationMarginLength = 10;

    /**
     * When true, {@link #transform(ResponseData)} returns null right after
     * extraction if the extracted content is blank.
     */
    public boolean ignoreEmptyContent = false;

    /** Maximum width passed to {@link #abbreviate(String, int)} for titles. */
    public int maxTitleLength = 100;

    /** Maximum width passed to {@link #abbreviate(String, int)} for digests. */
    public int maxDigestLength = 200;

    /** Whether the extracted meta values are added to the "content" field. */
    public boolean appendMetaContentToContent = true;

    /** Whether the extracted body text is added to the "content" field. */
    public boolean appendBodyContentToContent = true;

    /**
     * Cache of parent-URL encodings, keyed by "sessionId:parentUrl" (see
     * {@link #getParentEncoding(String, String)}). Wrapped in a synchronized
     * map; the backing LruHashMap is constructed with 1000.
     */
    public Map<String, String> parentEncodingMap = Collections
            .synchronizedMap(new LruHashMap<String, String>(1000));

    /**
     * Returns the extractor used to obtain the text of the given response.
     *
     * @param responseData the response being transformed
     * @return the extractor to use
     */
    protected abstract Extractor getExtractor(ResponseData responseData);

    /**
     * Stores one field of the result document. Subclasses may override this
     * to intercept or alter the stored values.
     *
     * @param dataMap the map that is serialized into the result
     * @param key the field name
     * @param value the field value
     */
    protected void putResultDataBody(Map<String, Object> dataMap, String key,
            Object value) {
        dataMap.put(key, value);
    }

    /**
     * Extracts the text of the response body and builds the result document.
     *
     * @param responseData the crawled response; must carry a response body
     * @return the result holding the serialized field map, or null when the
     *         extracted content is blank
     * @throws RobotCrawlAccessException if there is no response body, the
     *         text cannot be extracted, or the field map cannot be serialized
     */
    @Override
    public ResultData transform(ResponseData responseData) {
        if (responseData == null || responseData.getResponseBody() == null) {
            throw new RobotCrawlAccessException("No response body.");
        }

        Extractor extractor = getExtractor(responseData);
        InputStream in = responseData.getResponseBody();
        Map<String, String> params = new HashMap<String, String>();
        params.put(ExtractData.RESOURCE_NAME_KEY,
                getResourceName(responseData));
        params.put(ExtractData.CONTENT_TYPE, responseData.getMimeType());

        String content;
        StringBuilder contentMetaBuf = new StringBuilder(1000);
        try {
            ExtractData extractData = extractor.getText(in, params);
            String extracted = extractData.getContent();
            if (ignoreEmptyContent && StringUtil.isBlank(extracted)) {
                return null;
            }
            // A null content must not end up as the literal text "null"
            // (which is what appending it to a StringBuilder would produce).
            content = extracted == null ? "" : extracted;
            // meta
            for (String key : extractData.getKeySet()) {
                String[] values = extractData.getValues(key);
                if (values != null) {
                    // Separate the values of different keys so that their
                    // adjacent words are not glued together.
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(StringUtils.join(values, ' '));
                }
            }
        } catch (Exception e) {
            RobotCrawlAccessException rcae = new RobotCrawlAccessException(
                    "Could not get a text from " + responseData.getUrl(), e);
            rcae.setLogLevel(RobotCrawlAccessException.WARN);
            throw rcae;
        } finally {
            IOUtils.closeQuietly(in);
        }
        String contentMeta = contentMetaBuf.toString();

        if (StringUtil.isBlank(content)) {
            return null;
        }

        ResultData resultData = new ResultData();
        resultData.setTransformerName(getName());

        CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                .getComponent("crawlingSessionHelper");
        String sessionId = crawlingSessionHelper
                .getCanonicalSessionId(responseData.getSessionId());
        PathMappingHelper pathMappingHelper = SingletonS2Container
                .getComponent("pathMappingHelper");
        String url = pathMappingHelper.replaceUrl(sessionId, responseData
                .getUrl());

        Map<String, Object> dataMap = new HashMap<String, Object>();

        // segment
        putResultDataBody(dataMap, "segment", sessionId);
        // content
        StringBuilder buf = new StringBuilder();
        if (appendBodyContentToContent) {
            buf.append(content);
        }
        if (appendMetaContentToContent && contentMeta.length() > 0) {
            // Keep the last body word and the first meta word apart.
            if (buf.length() > 0) {
                buf.append(' ');
            }
            buf.append(contentMeta);
        }
        String body = normalizeContent(buf.toString());
        boolean hasBody = StringUtil.isNotBlank(body);
        putResultDataBody(dataMap, "content", hasBody ? body : "");
        // cache
        String cache = normalizeContent(content);
        putResultDataBody(dataMap, "cache", cache);
        // digest
        putResultDataBody(dataMap, "digest", Constants.DIGEST_PREFIX
                + abbreviate(cache, maxDigestLength));
        // title
        if (url.endsWith("/")) {
            // The title is taken from the body, so the body (not the raw
            // content, which is never blank here) decides whether the
            // fallback label is needed.
            if (hasBody) {
                putResultDataBody(dataMap, "title", abbreviate(body,
                        maxTitleLength));
            } else {
                putResultDataBody(dataMap, "title", noTitleLabel);
            }
        } else {
            // Use the last path segment of the decoded URL as the title.
            String u = decodeUrl(url);
            int pos = u.lastIndexOf('/');
            if (pos == -1) {
                putResultDataBody(dataMap, "title", u);
            } else {
                putResultDataBody(dataMap, "title", u.substring(pos + 1));
            }
        }
        // host
        putResultDataBody(dataMap, "host", getHost(url));
        // site
        putResultDataBody(dataMap, "site", getSite(url, responseData
                .getCharSet()));
        // url
        putResultDataBody(dataMap, "url", url);
        // tstamp
        putResultDataBody(dataMap, "tstamp", Long.toString(System
                .currentTimeMillis()));
        // TODO anchor
        putResultDataBody(dataMap, "anchor", "");
        // mimetype
        putResultDataBody(dataMap, "mimetype", responseData.getMimeType());
        // contentLength
        putResultDataBody(dataMap, "contentLength", Long.toString(responseData
                .getContentLength()));
        // lastModified: left out when the response carries no date
        Date lastModified = responseData.getLastModified();
        if (lastModified != null) {
            putResultDataBody(dataMap, "lastModified", Long
                    .toString(lastModified.getTime()));
        }
        // config
        CrawlingConfigHelper crawlingConfigHelper = SingletonS2Container
                .getComponent("crawlingConfigHelper");
        CrawlingConfig crawlingConfig = crawlingConfigHelper
                .getCrawlingConfig(responseData.getSessionId());
        // indexingTarget
        putResultDataBody(dataMap, Constants.INDEXING_TARGET, crawlingConfig
                .getIndexingTarget(url));
        // boost
        putResultDataBody(dataMap, "boost", crawlingConfig.getDocumentBoost());
        // type: browserType
        List<String> browserTypeList = new ArrayList<String>();
        for (String browserType : crawlingConfig.getBrowserTypeValues()) {
            browserTypeList.add(browserType);
        }
        putResultDataBody(dataMap, "type", browserTypeList);
        // label: labelType
        List<String> labelTypeList = new ArrayList<String>();
        for (String labelType : crawlingConfig.getLabelTypeValues()) {
            labelTypeList.add(labelType);
        }
        putResultDataBody(dataMap, "label", labelTypeList);
        // role: roleType
        List<String> roleTypeList = new ArrayList<String>();
        for (String roleType : crawlingConfig.getRoleTypeValues()) {
            roleTypeList.add(roleType);
        }
        putResultDataBody(dataMap, "role", roleTypeList);
        // TODO date
        // TODO lang
        // id (generated last, from the fields collected so far)
        putResultDataBody(dataMap, "id", crawlingSessionHelper
                .generateId(dataMap));

        try {
            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
        } catch (Exception e) {
            throw new RobotCrawlAccessException("Could not serialize object: "
                    + url, e);
        }
        resultData.setEncoding(charsetName);

        return resultData;
    }

    /**
     * Abbreviates a string to at most {@code maxWidth} characters. If the
     * result still takes more than {@code maxWidth + abbreviationMarginLength}
     * bytes in UTF-8, the string is abbreviated again to half the width.
     *
     * @param str the string to abbreviate; may be null
     * @param maxWidth the maximum number of characters (at least 4)
     * @return the abbreviated string, or null if {@code str} is null
     */
    protected String abbreviate(String str, int maxWidth) {
        String newStr = StringUtils.abbreviate(str, maxWidth);
        if (newStr == null) {
            return null;
        }
        try {
            if (newStr.getBytes(Constants.UTF_8).length > maxWidth
                    + abbreviationMarginLength) {
                // Never go below the minimum width StringUtils accepts.
                newStr = StringUtils.abbreviate(str, Math.max(maxWidth / 2,
                        MIN_ABBREVIATION_WIDTH));
            }
        } catch (UnsupportedEncodingException e) {
            // The byte length cannot be measured; keep the first result.
        }
        return newStr;
    }

    /**
     * Derives the resource name handed to the extractor: the last path
     * segment of the response URL, URL-decoded with the response charset.
     *
     * @param responseData the response being transformed
     * @return the decoded name, the undecoded name if decoding fails, or
     *         null if the URL or the charset is unknown
     */
    private String getResourceName(ResponseData responseData) {
        String name = responseData.getUrl();
        String enc = responseData.getCharSet();

        if (name == null || enc == null) {
            return null;
        }

        // Drop trailing slashes, then keep what follows the last slash.
        name = name.replaceAll("/+$", "");
        int idx = name.lastIndexOf("/");
        if (idx >= 0) {
            name = name.substring(idx + 1);
        }
        String decoded = decodeQuietly(name, enc);
        return decoded != null ? decoded : name;
    }

    /**
     * URL-decodes a value without propagating decoding failures.
     *
     * @param value the value to decode; must not be null
     * @param enc the charset name
     * @return the decoded value, or null if the charset is unsupported or
     *         the value contains malformed percent escapes
     */
    private static String decodeQuietly(String value, String enc) {
        try {
            return URLDecoder.decode(value, enc);
        } catch (Exception e) {
            // UnsupportedEncodingException (unknown charset) or
            // IllegalArgumentException (e.g. a literal '%' in a file name).
            return null;
        }
    }

    /**
     * URL-decodes the given URL. The charset is the {@link #encoding} field
     * if set, otherwise the encoding recorded for the parent URL of the
     * current URL queue entry; if neither yields a result, UTF-8 is tried.
     *
     * @param url the URL to decode; may be null
     * @return the decoded URL, the URL unchanged if it cannot be decoded, or
     *         null if {@code url} is null
     */
    protected String decodeUrl(String url) {
        if (url == null) {
            return null;
        }

        String enc = encoding;
        if (enc == null) {
            UrlQueue urlQueue = CrawlingParameterUtil.getUrlQueue();
            if (urlQueue != null) {
                String parentUrl = urlQueue.getParentUrl();
                if (StringUtil.isNotEmpty(parentUrl)) {
                    enc = getParentEncoding(parentUrl, urlQueue
                            .getSessionId());
                }
            }
        }

        if (enc != null) {
            String decoded = decodeQuietly(url, enc);
            if (decoded != null) {
                return decoded;
            }
        }

        String decoded = decodeQuietly(url, Constants.UTF_8);
        return decoded != null ? decoded : url;
    }

    /**
     * Returns the encoding stored with the access result of a parent URL.
     * Found encodings are cached in {@link #parentEncodingMap}.
     *
     * @param parentUrl the parent URL
     * @param sessionId the crawling session id
     * @return the encoding, or null if none is stored
     */
    protected String getParentEncoding(String parentUrl, String sessionId) {
        String key = sessionId + ":" + parentUrl;
        String cached = parentEncodingMap.get(key);
        if (cached != null) {
            return cached;
        }

        AccessResultDataCB cb = new AccessResultDataCB();
        cb.query().queryAccessResult().setSessionId_Equal(sessionId);
        cb.query().queryAccessResult().setUrl_Equal(parentUrl);
        cb.specify().columnEncoding();
        AccessResultData accessResultData = SingletonS2Container.getComponent(
                AccessResultDataBhv.class).selectEntity(cb);
        if (accessResultData != null && accessResultData.getEncoding() != null) {
            String found = accessResultData.getEncoding();
            parentEncodingMap.put(key, found);
            return found;
        }
        return null;
    }

    /**
     * Returns the host of a URL. For URLs starting with "file:////" the host
     * is the first path segment after that prefix; any other "file:" URL
     * yields "localhost". Other URLs are delegated to the superclass.
     *
     * @param url the URL
     * @return the host, or an empty string if the URL is blank
     */
    protected String getHost(String url) {
        if (StringUtil.isBlank(url)) {
            return ""; // empty
        }

        if (url.startsWith("file:////")) {
            String value = decodeUrl(url.substring(9));
            int pos = value.indexOf('/');
            if (pos > 0) {
                return value.substring(0, pos);
            } else if (pos == -1) {
                // no further slash: the whole remainder is the host
                return value;
            } else {
                // remainder starts with a slash: no host part
                return "localhost";
            }
        } else if (url.startsWith("file:")) {
            return "localhost";
        }

        return super.getHost(url);
    }

    /**
     * Returns the site string of a URL, abbreviated to maxSiteLength. URLs
     * starting with "file:////" become a backslash path prefixed with "\\";
     * other "file:" URLs become a Windows path when the third character
     * after "file:" is a colon, and are kept as they are otherwise. Other
     * URLs are delegated to the superclass.
     *
     * @param url the URL
     * @param encoding the charset name passed on to the superclass
     * @return the site string, or an empty string if the URL is blank
     */
    protected String getSite(String url, String encoding) {
        if (StringUtil.isBlank(url)) {
            return ""; // empty
        }

        if (url.startsWith("file:////")) {
            String value = decodeUrl(url.substring(9));
            return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'),
                    maxSiteLength);
        } else if (url.startsWith("file:")) {
            String value = decodeUrl(url.substring(5));
            if (value.length() > 2 && value.charAt(2) == ':') {
                // Windows: drop the leading slash before the drive letter
                return StringUtils.abbreviate(value.substring(1).replace('/',
                        '\\'), maxSiteLength);
            } else {
                // Unix
                return StringUtils.abbreviate(value, maxSiteLength);
            }
        }

        return super.getSite(url, encoding);
    }

    /**
     * Restores the object serialized into an access result by
     * {@link #transform(ResponseData)}.
     *
     * @param accessResultData the stored access result data
     * @return the deserialized object, or an empty map if no data is stored
     * @throws RobotSystemException if the data cannot be deserialized
     */
    @Override
    public Object getData(AccessResultData accessResultData) {
        byte[] data = accessResultData.getData();
        if (data != null) {
            try {
                return SerializeUtil.fromBinaryToObject(data);
            } catch (Exception e) {
                throw new RobotSystemException(
                        "Could not create an instanced from bytes.", e);
            }
        }
        return new HashMap<String, Object>();
    }
}
